From 2c3c1048746a4622d8c89a29670120dc8fab93c4 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:49:45 +0200 Subject: Adding upstream version 6.1.76. Signed-off-by: Daniel Baumann --- mm/Kconfig | 1159 +++++ mm/Kconfig.debug | 210 + mm/Makefile | 140 + mm/backing-dev.c | 1003 +++++ mm/balloon_compaction.c | 258 ++ mm/bootmem_info.c | 128 + mm/cma.c | 588 +++ mm/cma.h | 53 + mm/cma_debug.c | 197 + mm/cma_sysfs.c | 112 + mm/compaction.c | 3068 +++++++++++++ mm/damon/Kconfig | 106 + mm/damon/Makefile | 9 + mm/damon/core-test.h | 313 ++ mm/damon/core.c | 1310 ++++++ mm/damon/dbgfs-test.h | 163 + mm/damon/dbgfs.c | 1114 +++++ mm/damon/lru_sort.c | 340 ++ mm/damon/modules-common.h | 46 + mm/damon/ops-common.c | 135 + mm/damon/ops-common.h | 18 + mm/damon/paddr.c | 319 ++ mm/damon/reclaim.c | 284 ++ mm/damon/sysfs.c | 2909 +++++++++++++ mm/damon/vaddr-test.h | 316 ++ mm/damon/vaddr.c | 715 ++++ mm/debug.c | 262 ++ mm/debug_page_ref.c | 55 + mm/debug_vm_pgtable.c | 1357 ++++++ mm/dmapool.c | 529 +++ mm/early_ioremap.c | 299 ++ mm/fadvise.c | 229 + mm/failslab.c | 73 + mm/filemap.c | 4017 +++++++++++++++++ mm/folio-compat.c | 148 + mm/frontswap.c | 277 ++ mm/gup.c | 3305 ++++++++++++++ mm/gup_test.c | 250 ++ mm/gup_test.h | 33 + mm/highmem.c | 814 ++++ mm/hmm.c | 599 +++ mm/huge_memory.c | 3302 ++++++++++++++ mm/hugetlb.c | 7698 +++++++++++++++++++++++++++++++++ mm/hugetlb_cgroup.c | 919 ++++ mm/hugetlb_vmemmap.c | 577 +++ mm/hugetlb_vmemmap.h | 60 + mm/hwpoison-inject.c | 112 + mm/init-mm.c | 55 + mm/internal.h | 870 ++++ mm/interval_tree.c | 111 + mm/io-mapping.c | 29 + mm/ioremap.c | 61 + mm/kasan/Makefile | 49 + mm/kasan/common.c | 452 ++ mm/kasan/generic.c | 519 +++ mm/kasan/hw_tags.c | 336 ++ mm/kasan/init.c | 491 +++ mm/kasan/kasan.h | 635 +++ mm/kasan/kasan_test.c | 1457 +++++++ mm/kasan/kasan_test_module.c | 141 + mm/kasan/quarantine.c | 438 ++ mm/kasan/report.c | 564 +++ mm/kasan/report_generic.c | 369 ++ mm/kasan/report_hw_tags.c | 38 + 
mm/kasan/report_sw_tags.c | 69 + mm/kasan/report_tags.c | 116 + mm/kasan/shadow.c | 598 +++ mm/kasan/sw_tags.c | 178 + mm/kasan/tags.c | 144 + mm/kfence/.kunitconfig | 6 + mm/kfence/Makefile | 6 + mm/kfence/core.c | 1164 +++++ mm/kfence/kfence.h | 134 + mm/kfence/kfence_test.c | 870 ++++ mm/kfence/report.c | 327 ++ mm/khugepaged.c | 2738 ++++++++++++ mm/kmemleak.c | 2136 ++++++++++ mm/kmsan/Makefile | 28 + mm/kmsan/core.c | 450 ++ mm/kmsan/hooks.c | 424 ++ mm/kmsan/init.c | 235 + mm/kmsan/instrumentation.c | 308 ++ mm/kmsan/kmsan.h | 211 + mm/kmsan/kmsan_test.c | 582 +++ mm/kmsan/report.c | 219 + mm/kmsan/shadow.c | 308 ++ mm/ksm.c | 3230 ++++++++++++++ mm/list_lru.c | 605 +++ mm/maccess.c | 230 + mm/madvise.c | 1514 +++++++ mm/mapping_dirty_helpers.c | 354 ++ mm/memblock.c | 2175 ++++++++++ mm/memcontrol.c | 7805 +++++++++++++++++++++++++++++++++ mm/memfd.c | 344 ++ mm/memory-failure.c | 2629 ++++++++++++ mm/memory-tiers.c | 732 ++++ mm/memory.c | 6018 ++++++++++++++++++++++++++ mm/memory_hotplug.c | 2282 ++++++++++ mm/mempolicy.c | 3163 ++++++++++++++ mm/mempool.c | 556 +++ mm/memremap.c | 550 +++ mm/memtest.c | 113 + mm/migrate.c | 2237 ++++++++++ mm/migrate_device.c | 975 +++++ mm/mincore.c | 283 ++ mm/mlock.c | 777 ++++ mm/mm_init.c | 206 + mm/mm_slot.h | 55 + mm/mmap.c | 3901 +++++++++++++++++ mm/mmap_lock.c | 246 ++ mm/mmu_gather.c | 367 ++ mm/mmu_notifier.c | 1132 +++++ mm/mmzone.c | 112 + mm/mprotect.c | 875 ++++ mm/mremap.c | 1105 +++++ mm/msync.c | 114 + mm/nommu.c | 1871 ++++++++ mm/oom_kill.c | 1262 ++++++ mm/page-writeback.c | 3084 +++++++++++++ mm/page_alloc.c | 9728 ++++++++++++++++++++++++++++++++++++++++++ mm/page_counter.c | 264 ++ mm/page_ext.c | 529 +++ mm/page_idle.c | 220 + mm/page_io.c | 537 +++ mm/page_isolation.c | 671 +++ mm/page_owner.c | 726 ++++ mm/page_poison.c | 106 + mm/page_reporting.c | 375 ++ mm/page_reporting.h | 53 + mm/page_table_check.c | 254 ++ mm/page_vma_mapped.c | 316 ++ mm/pagewalk.c | 619 +++ mm/percpu-internal.h | 257 
++ mm/percpu-km.c | 130 + mm/percpu-stats.c | 235 + mm/percpu-vm.c | 410 ++ mm/percpu.c | 3461 +++++++++++++++ mm/pgalloc-track.h | 51 + mm/pgtable-generic.c | 231 + mm/process_vm_access.c | 304 ++ mm/ptdump.c | 165 + mm/readahead.c | 851 ++++ mm/rmap.c | 2577 +++++++++++ mm/rodata_test.c | 52 + mm/secretmem.c | 293 ++ mm/shmem.c | 4376 +++++++++++++++++++ mm/shrinker_debug.c | 289 ++ mm/shuffle.c | 182 + mm/shuffle.h | 53 + mm/slab.c | 4053 ++++++++++++++++++ mm/slab.h | 874 ++++ mm/slab_common.c | 1456 +++++++ mm/slob.c | 757 ++++ mm/slub.c | 6310 +++++++++++++++++++++++++++ mm/sparse-vmemmap.c | 398 ++ mm/sparse.c | 934 ++++ mm/swap.c | 1127 +++++ mm/swap.h | 148 + mm/swap_cgroup.c | 231 + mm/swap_slots.c | 350 ++ mm/swap_state.c | 910 ++++ mm/swapfile.c | 3686 ++++++++++++++++ mm/truncate.c | 867 ++++ mm/usercopy.c | 276 ++ mm/userfaultfd.c | 793 ++++ mm/util.c | 1195 ++++++ mm/vmalloc.c | 4218 ++++++++++++++++++ mm/vmpressure.c | 481 +++ mm/vmscan.c | 7793 +++++++++++++++++++++++++++++++++ mm/vmstat.c | 2251 ++++++++++ mm/workingset.c | 748 ++++ mm/z3fold.c | 1710 ++++++++ mm/zbud.c | 640 +++ mm/zpool.c | 399 ++ mm/zsmalloc.c | 2373 +++++++++++ mm/zswap.c | 1565 +++++++ 176 files changed, 183554 insertions(+) create mode 100644 mm/Kconfig create mode 100644 mm/Kconfig.debug create mode 100644 mm/Makefile create mode 100644 mm/backing-dev.c create mode 100644 mm/balloon_compaction.c create mode 100644 mm/bootmem_info.c create mode 100644 mm/cma.c create mode 100644 mm/cma.h create mode 100644 mm/cma_debug.c create mode 100644 mm/cma_sysfs.c create mode 100644 mm/compaction.c create mode 100644 mm/damon/Kconfig create mode 100644 mm/damon/Makefile create mode 100644 mm/damon/core-test.h create mode 100644 mm/damon/core.c create mode 100644 mm/damon/dbgfs-test.h create mode 100644 mm/damon/dbgfs.c create mode 100644 mm/damon/lru_sort.c create mode 100644 mm/damon/modules-common.h create mode 100644 mm/damon/ops-common.c create mode 100644 mm/damon/ops-common.h 
create mode 100644 mm/damon/paddr.c create mode 100644 mm/damon/reclaim.c create mode 100644 mm/damon/sysfs.c create mode 100644 mm/damon/vaddr-test.h create mode 100644 mm/damon/vaddr.c create mode 100644 mm/debug.c create mode 100644 mm/debug_page_ref.c create mode 100644 mm/debug_vm_pgtable.c create mode 100644 mm/dmapool.c create mode 100644 mm/early_ioremap.c create mode 100644 mm/fadvise.c create mode 100644 mm/failslab.c create mode 100644 mm/filemap.c create mode 100644 mm/folio-compat.c create mode 100644 mm/frontswap.c create mode 100644 mm/gup.c create mode 100644 mm/gup_test.c create mode 100644 mm/gup_test.h create mode 100644 mm/highmem.c create mode 100644 mm/hmm.c create mode 100644 mm/huge_memory.c create mode 100644 mm/hugetlb.c create mode 100644 mm/hugetlb_cgroup.c create mode 100644 mm/hugetlb_vmemmap.c create mode 100644 mm/hugetlb_vmemmap.h create mode 100644 mm/hwpoison-inject.c create mode 100644 mm/init-mm.c create mode 100644 mm/internal.h create mode 100644 mm/interval_tree.c create mode 100644 mm/io-mapping.c create mode 100644 mm/ioremap.c create mode 100644 mm/kasan/Makefile create mode 100644 mm/kasan/common.c create mode 100644 mm/kasan/generic.c create mode 100644 mm/kasan/hw_tags.c create mode 100644 mm/kasan/init.c create mode 100644 mm/kasan/kasan.h create mode 100644 mm/kasan/kasan_test.c create mode 100644 mm/kasan/kasan_test_module.c create mode 100644 mm/kasan/quarantine.c create mode 100644 mm/kasan/report.c create mode 100644 mm/kasan/report_generic.c create mode 100644 mm/kasan/report_hw_tags.c create mode 100644 mm/kasan/report_sw_tags.c create mode 100644 mm/kasan/report_tags.c create mode 100644 mm/kasan/shadow.c create mode 100644 mm/kasan/sw_tags.c create mode 100644 mm/kasan/tags.c create mode 100644 mm/kfence/.kunitconfig create mode 100644 mm/kfence/Makefile create mode 100644 mm/kfence/core.c create mode 100644 mm/kfence/kfence.h create mode 100644 mm/kfence/kfence_test.c create mode 100644 mm/kfence/report.c 
create mode 100644 mm/khugepaged.c create mode 100644 mm/kmemleak.c create mode 100644 mm/kmsan/Makefile create mode 100644 mm/kmsan/core.c create mode 100644 mm/kmsan/hooks.c create mode 100644 mm/kmsan/init.c create mode 100644 mm/kmsan/instrumentation.c create mode 100644 mm/kmsan/kmsan.h create mode 100644 mm/kmsan/kmsan_test.c create mode 100644 mm/kmsan/report.c create mode 100644 mm/kmsan/shadow.c create mode 100644 mm/ksm.c create mode 100644 mm/list_lru.c create mode 100644 mm/maccess.c create mode 100644 mm/madvise.c create mode 100644 mm/mapping_dirty_helpers.c create mode 100644 mm/memblock.c create mode 100644 mm/memcontrol.c create mode 100644 mm/memfd.c create mode 100644 mm/memory-failure.c create mode 100644 mm/memory-tiers.c create mode 100644 mm/memory.c create mode 100644 mm/memory_hotplug.c create mode 100644 mm/mempolicy.c create mode 100644 mm/mempool.c create mode 100644 mm/memremap.c create mode 100644 mm/memtest.c create mode 100644 mm/migrate.c create mode 100644 mm/migrate_device.c create mode 100644 mm/mincore.c create mode 100644 mm/mlock.c create mode 100644 mm/mm_init.c create mode 100644 mm/mm_slot.h create mode 100644 mm/mmap.c create mode 100644 mm/mmap_lock.c create mode 100644 mm/mmu_gather.c create mode 100644 mm/mmu_notifier.c create mode 100644 mm/mmzone.c create mode 100644 mm/mprotect.c create mode 100644 mm/mremap.c create mode 100644 mm/msync.c create mode 100644 mm/nommu.c create mode 100644 mm/oom_kill.c create mode 100644 mm/page-writeback.c create mode 100644 mm/page_alloc.c create mode 100644 mm/page_counter.c create mode 100644 mm/page_ext.c create mode 100644 mm/page_idle.c create mode 100644 mm/page_io.c create mode 100644 mm/page_isolation.c create mode 100644 mm/page_owner.c create mode 100644 mm/page_poison.c create mode 100644 mm/page_reporting.c create mode 100644 mm/page_reporting.h create mode 100644 mm/page_table_check.c create mode 100644 mm/page_vma_mapped.c create mode 100644 mm/pagewalk.c create mode 
100644 mm/percpu-internal.h create mode 100644 mm/percpu-km.c create mode 100644 mm/percpu-stats.c create mode 100644 mm/percpu-vm.c create mode 100644 mm/percpu.c create mode 100644 mm/pgalloc-track.h create mode 100644 mm/pgtable-generic.c create mode 100644 mm/process_vm_access.c create mode 100644 mm/ptdump.c create mode 100644 mm/readahead.c create mode 100644 mm/rmap.c create mode 100644 mm/rodata_test.c create mode 100644 mm/secretmem.c create mode 100644 mm/shmem.c create mode 100644 mm/shrinker_debug.c create mode 100644 mm/shuffle.c create mode 100644 mm/shuffle.h create mode 100644 mm/slab.c create mode 100644 mm/slab.h create mode 100644 mm/slab_common.c create mode 100644 mm/slob.c create mode 100644 mm/slub.c create mode 100644 mm/sparse-vmemmap.c create mode 100644 mm/sparse.c create mode 100644 mm/swap.c create mode 100644 mm/swap.h create mode 100644 mm/swap_cgroup.c create mode 100644 mm/swap_slots.c create mode 100644 mm/swap_state.c create mode 100644 mm/swapfile.c create mode 100644 mm/truncate.c create mode 100644 mm/usercopy.c create mode 100644 mm/userfaultfd.c create mode 100644 mm/util.c create mode 100644 mm/vmalloc.c create mode 100644 mm/vmpressure.c create mode 100644 mm/vmscan.c create mode 100644 mm/vmstat.c create mode 100644 mm/workingset.c create mode 100644 mm/z3fold.c create mode 100644 mm/zbud.c create mode 100644 mm/zpool.c create mode 100644 mm/zsmalloc.c create mode 100644 mm/zswap.c (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig new file mode 100644 index 000000000..35109a4a2 --- /dev/null +++ b/mm/Kconfig @@ -0,0 +1,1159 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Memory Management options" + +# +# For some reason microblaze and nios2 hard code SWAP=n. Hopefully we can +# add proper SWAP support to them, in which case this can be removed. 
+# +config ARCH_NO_SWAP + bool + +config ZPOOL + bool + +menuconfig SWAP + bool "Support for paging of anonymous memory (swap)" + depends on MMU && BLOCK && !ARCH_NO_SWAP + default y + help + This option allows you to choose whether you want to have support + for so called swap devices or swap files in your kernel that are + used to provide more virtual memory than the actual RAM present + in your computer. If unsure say Y. + +config ZSWAP + bool "Compressed cache for swap pages" + depends on SWAP + select FRONTSWAP + select CRYPTO + select ZPOOL + help + A lightweight compressed cache for swap pages. It takes + pages that are in the process of being swapped out and attempts to + compress them into a dynamically allocated RAM-based memory pool. + This can result in a significant I/O reduction on swap device and, + in the case where decompressing from RAM is faster than swap device + reads, can also improve workload performance. + +config ZSWAP_DEFAULT_ON + bool "Enable the compressed cache for swap pages by default" + depends on ZSWAP + help + If selected, the compressed cache for swap pages will be enabled + at boot, otherwise it will be disabled. + + The selection made here can be overridden by using the kernel + command line 'zswap.enabled=' option. + +choice + prompt "Default compressor" + depends on ZSWAP + default ZSWAP_COMPRESSOR_DEFAULT_LZO + help + Selects the default compression algorithm for the compressed cache + for swap pages. + + For an overview what kind of performance can be expected from + a particular compression algorithm please refer to the benchmarks + available at the following LWN page: + https://lwn.net/Articles/751795/ + + If in doubt, select 'LZO'. + + The selection made here can be overridden by using the kernel + command line 'zswap.compressor=' option. + +config ZSWAP_COMPRESSOR_DEFAULT_DEFLATE + bool "Deflate" + select CRYPTO_DEFLATE + help + Use the Deflate algorithm as the default compression algorithm. 
+ +config ZSWAP_COMPRESSOR_DEFAULT_LZO + bool "LZO" + select CRYPTO_LZO + help + Use the LZO algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_842 + bool "842" + select CRYPTO_842 + help + Use the 842 algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_LZ4 + bool "LZ4" + select CRYPTO_LZ4 + help + Use the LZ4 algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_LZ4HC + bool "LZ4HC" + select CRYPTO_LZ4HC + help + Use the LZ4HC algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_ZSTD + bool "zstd" + select CRYPTO_ZSTD + help + Use the zstd algorithm as the default compression algorithm. +endchoice + +config ZSWAP_COMPRESSOR_DEFAULT + string + depends on ZSWAP + default "deflate" if ZSWAP_COMPRESSOR_DEFAULT_DEFLATE + default "lzo" if ZSWAP_COMPRESSOR_DEFAULT_LZO + default "842" if ZSWAP_COMPRESSOR_DEFAULT_842 + default "lz4" if ZSWAP_COMPRESSOR_DEFAULT_LZ4 + default "lz4hc" if ZSWAP_COMPRESSOR_DEFAULT_LZ4HC + default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD + default "" + +choice + prompt "Default allocator" + depends on ZSWAP + default ZSWAP_ZPOOL_DEFAULT_ZBUD + help + Selects the default allocator for the compressed cache for + swap pages. + The default is 'zbud' for compatibility, however please do + read the description of each of the allocators below before + making a right choice. + + The selection made here can be overridden by using the kernel + command line 'zswap.zpool=' option. + +config ZSWAP_ZPOOL_DEFAULT_ZBUD + bool "zbud" + select ZBUD + help + Use the zbud allocator as the default allocator. + +config ZSWAP_ZPOOL_DEFAULT_Z3FOLD + bool "z3fold" + select Z3FOLD + help + Use the z3fold allocator as the default allocator. + +config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC + bool "zsmalloc" + select ZSMALLOC + help + Use the zsmalloc allocator as the default allocator. 
+endchoice + +config ZSWAP_ZPOOL_DEFAULT + string + depends on ZSWAP + default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD + default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD + default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC + default "" + +config ZBUD + tristate "2:1 compression allocator (zbud)" + depends on ZSWAP + help + A special purpose allocator for storing compressed pages. + It is designed to store up to two compressed pages per physical + page. While this design limits storage density, it has simple and + deterministic reclaim properties that make it preferable to a higher + density approach when reclaim will be used. + +config Z3FOLD + tristate "3:1 compression allocator (z3fold)" + depends on ZSWAP + help + A special purpose allocator for storing compressed pages. + It is designed to store up to three compressed pages per physical + page. It is a ZBUD derivative so the simplicity and determinism are + still there. + +config ZSMALLOC + tristate + prompt "N:1 compression allocator (zsmalloc)" if ZSWAP + depends on MMU + help + zsmalloc is a slab-based memory allocator designed to store + pages of various compression levels efficiently. It achieves + the highest storage density with the least amount of fragmentation. + +config ZSMALLOC_STAT + bool "Export zsmalloc statistics" + depends on ZSMALLOC + select DEBUG_FS + help + This option enables code in the zsmalloc to collect various + statistics about what's happening in zsmalloc and exports that + information to userspace via debugfs. + If unsure, say N. + +menu "SLAB allocator options" + +choice + prompt "Choose SLAB allocator" + default SLUB + help + This option allows to select a slab allocator. + +config SLAB + bool "SLAB" + depends on !PREEMPT_RT + select HAVE_HARDENED_USERCOPY_ALLOCATOR + help + The regular slab allocator that is established and known to work + well in all environments. It organizes cache hot objects in + per cpu and per node queues. 
+ +config SLUB + bool "SLUB (Unqueued Allocator)" + select HAVE_HARDENED_USERCOPY_ALLOCATOR + help + SLUB is a slab allocator that minimizes cache line usage + instead of managing queues of cached objects (SLAB approach). + Per cpu caching is realized using slabs of objects instead + of queues of objects. SLUB can use memory efficiently + and has enhanced diagnostics. SLUB is the default choice for + a slab allocator. + +config SLOB + depends on EXPERT + bool "SLOB (Simple Allocator)" + depends on !PREEMPT_RT + help + SLOB replaces the stock allocator with a drastically simpler + allocator. SLOB is generally more space efficient but + does not perform as well on large systems. + +endchoice + +config SLAB_MERGE_DEFAULT + bool "Allow slab caches to be merged" + default y + depends on SLAB || SLUB + help + For reduced kernel memory fragmentation, slab caches can be + merged when they share the same size and other characteristics. + This carries a risk of kernel heap overflows being able to + overwrite objects from merged caches (and more easily control + cache layout), which makes such heap attacks easier to exploit + by attackers. By keeping caches unmerged, these kinds of exploits + can usually only damage objects in the same cache. To disable + merging at runtime, "slab_nomerge" can be passed on the kernel + command line. + +config SLAB_FREELIST_RANDOM + bool "Randomize slab freelist" + depends on SLAB || SLUB + help + Randomizes the freelist order used on creating new pages. This + security feature reduces the predictability of the kernel slab + allocator against heap overflows. + +config SLAB_FREELIST_HARDENED + bool "Harden slab freelist metadata" + depends on SLAB || SLUB + help + Many kernel heap attacks try to target slab cache metadata and + other infrastructure. This option makes minor performance + sacrifices to harden the kernel slab allocator against common + freelist exploit methods. Some slab implementations have more + sanity-checking than others. 
This option is most effective with + CONFIG_SLUB. + +config SLUB_STATS + default n + bool "Enable SLUB performance statistics" + depends on SLUB && SYSFS + help + SLUB statistics are useful to debug SLUB's allocation behavior in + order to find ways to optimize the allocator. This should never be + enabled for production use since keeping statistics slows down + the allocator by a few percentage points. The slabinfo command + supports the determination of the most active slabs to figure + out which slabs are relevant to a particular load. + Try running: slabinfo -DA + +config SLUB_CPU_PARTIAL + default y + depends on SLUB && SMP + bool "SLUB per cpu partial cache" + help + Per cpu partial caches accelerate objects allocation and freeing + that is local to a processor at the price of more indeterminism + in the latency of the free. On overflow these caches will be cleared + which requires the taking of locks that may cause latency spikes. + Typically one would choose no for a realtime system. + +endmenu # SLAB allocator options + +config SHUFFLE_PAGE_ALLOCATOR + bool "Page allocator randomization" + default SLAB_FREELIST_RANDOM && ACPI_NUMA + help + Randomization of the page allocator improves the average + utilization of a direct-mapped memory-side-cache. See section + 5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI + 6.2a specification for an example of how a platform advertises + the presence of a memory-side-cache. There are also incidental + security benefits as it reduces the predictability of page + allocations to complement SLAB_FREELIST_RANDOM, but the + default granularity of shuffling on the "MAX_ORDER - 1" i.e., + 10th order of pages is selected based on cache utilization + benefits on x86. + + While the randomization improves cache utilization it may + negatively impact workloads on platforms without a cache. For + this reason, by default, the randomization is enabled only + after runtime detection of a direct-mapped memory-side-cache. 
+ Otherwise, the randomization may be force enabled with the + 'page_alloc.shuffle' kernel command line parameter. + + Say Y if unsure. + +config COMPAT_BRK + bool "Disable heap randomization" + default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). + This option changes the bootup default to heap randomization + disabled, and can be overridden at runtime by setting + /proc/sys/kernel/randomize_va_space to 2. + + On non-ancient distros (post-2000 ones) N is usually a safe choice. + +config MMAP_ALLOW_UNINITIALIZED + bool "Allow mmapped anonymous memory to be uninitialized" + depends on EXPERT && !MMU + default n + help + Normally, and according to the Linux spec, anonymous memory obtained + from mmap() has its contents cleared before it is passed to + userspace. Enabling this config option allows you to request that + mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus + providing a huge performance boost. If this option is not enabled, + then the flag will be ignored. + + This is taken advantage of by uClibc's malloc(), and also by + ELF-FDPIC binfmt's brk and stack allocator. + + Because of the obvious security issues, this option should only be + enabled on embedded devices where you control what is run in + userspace. Since that isn't generally a problem on no-MMU systems, + it is normally safe to say Y here. + + See Documentation/admin-guide/mm/nommu-mmap.rst for more information. + +config SELECT_MEMORY_MODEL + def_bool y + depends on ARCH_SELECT_MEMORY_MODEL + +choice + prompt "Memory model" + depends on SELECT_MEMORY_MODEL + default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT + default FLATMEM_MANUAL + help + This option allows you to change some of the ways that + Linux manages its memory internally. Most users will + only have one option here selected by the architecture + configuration. This is normal. 
+ +config FLATMEM_MANUAL + bool "Flat Memory" + depends on !ARCH_SPARSEMEM_ENABLE || ARCH_FLATMEM_ENABLE + help + This option is best suited for non-NUMA systems with + flat address space. The FLATMEM is the most efficient + system in terms of performance and resource consumption + and it is the best option for smaller systems. + + For systems that have holes in their physical address + spaces and for features like NUMA and memory hotplug, + choose "Sparse Memory". + + If unsure, choose this option (Flat Memory) over any other. + +config SPARSEMEM_MANUAL + bool "Sparse Memory" + depends on ARCH_SPARSEMEM_ENABLE + help + This will be the only option for some systems, including + memory hot-plug systems. This is normal. + + This option provides efficient support for systems with + holes in their physical address space and allows memory + hot-plug and hot-remove. + + If unsure, choose "Flat Memory" over this option. + +endchoice + +config SPARSEMEM + def_bool y + depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL + +config FLATMEM + def_bool y + depends on !SPARSEMEM || FLATMEM_MANUAL + +# +# SPARSEMEM_EXTREME (which is the default) does some bootmem +# allocations when sparse_init() is called. If this cannot +# be done on your architecture, select this option. However, +# statically allocating the mem_section[] array can potentially +# consume vast quantities of .bss, so be careful. +# +# This option will also potentially produce smaller runtime code +# with gcc 3.4 and later. +# +config SPARSEMEM_STATIC + bool + +# +# Architecture platforms which require a two level mem_section in SPARSEMEM +# must select this option. This is usually for architecture platforms with +# an extremely sparse physical address space. 
+# +config SPARSEMEM_EXTREME + def_bool y + depends on SPARSEMEM && !SPARSEMEM_STATIC + +config SPARSEMEM_VMEMMAP_ENABLE + bool + +config SPARSEMEM_VMEMMAP + bool "Sparse Memory virtual memmap" + depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE + default y + help + SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise + pfn_to_page and page_to_pfn operations. This is the most + efficient option when sufficient kernel resources are available. + +config HAVE_MEMBLOCK_PHYS_MAP + bool + +config HAVE_FAST_GUP + depends on MMU + bool + +# Don't discard allocated memory used to track "memory" and "reserved" memblocks +# after early boot, so it can still be used to test for validity of memory. +# Also, memblocks are updated with memory hot(un)plug. +config ARCH_KEEP_MEMBLOCK + bool + +# Keep arch NUMA mapping infrastructure post-init. +config NUMA_KEEP_MEMINFO + bool + +config MEMORY_ISOLATION + bool + +# IORESOURCE_SYSTEM_RAM regions in the kernel resource tree that are marked +# IORESOURCE_EXCLUSIVE cannot be mapped to user space, for example, via +# /dev/mem. +config EXCLUSIVE_SYSTEM_RAM + def_bool y + depends on !DEVMEM || STRICT_DEVMEM + +# +# Only be set on architectures that have completely implemented memory hotplug +# feature. If you are not sure, don't touch it. 
+# +config HAVE_BOOTMEM_INFO_NODE + def_bool n + +config ARCH_ENABLE_MEMORY_HOTPLUG + bool + +config ARCH_ENABLE_MEMORY_HOTREMOVE + bool + +# eventually, we can have this option just 'select SPARSEMEM' +menuconfig MEMORY_HOTPLUG + bool "Memory hotplug" + select MEMORY_ISOLATION + depends on SPARSEMEM + depends on ARCH_ENABLE_MEMORY_HOTPLUG + depends on 64BIT + select NUMA_KEEP_MEMINFO if NUMA + +if MEMORY_HOTPLUG + +config MEMORY_HOTPLUG_DEFAULT_ONLINE + bool "Online the newly added memory blocks by default" + depends on MEMORY_HOTPLUG + help + This option sets the default policy setting for memory hotplug + onlining policy (/sys/devices/system/memory/auto_online_blocks) which + determines what happens to newly added memory regions. Policy setting + can always be changed at runtime. + See Documentation/admin-guide/mm/memory-hotplug.rst for more information. + + Say Y here if you want all hot-plugged memory blocks to appear in + 'online' state by default. + Say N here if you want the default policy to keep all hot-plugged + memory blocks in 'offline' state. + +config MEMORY_HOTREMOVE + bool "Allow for memory hot remove" + select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) + depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE + depends on MIGRATION + +config MHP_MEMMAP_ON_MEMORY + def_bool y + depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP + depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + +endif # MEMORY_HOTPLUG + +# Heavily threaded applications may benefit from splitting the mm-wide +# page_table_lock, so that faults on different parts of the user address +# space can be handled with less contention: split it at this NR_CPUS. +# Default to 4 for wider testing, though 8 might be more appropriate. +# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. +# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. 
+# SPARC32 allocates multiple pte tables within a single page, and therefore +# a per-page lock leads to problems when multiple tables need to be locked +# at the same time (e.g. copy_page_range()). +# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. +# +config SPLIT_PTLOCK_CPUS + int + default "999999" if !MMU + default "999999" if ARM && !CPU_CACHE_VIPT + default "999999" if PARISC && !PA20 + default "999999" if SPARC32 + default "4" + +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + bool + +# +# support for memory balloon +config MEMORY_BALLOON + bool + +# +# support for memory balloon compaction +config BALLOON_COMPACTION + bool "Allow for balloon memory compaction/migration" + def_bool y + depends on COMPACTION && MEMORY_BALLOON + help + Memory fragmentation introduced by ballooning might reduce + significantly the number of 2MB contiguous memory blocks that can be + used within a guest, thus imposing performance penalties associated + with the reduced number of transparent huge pages that could be used + by the guest workload. Allowing the compaction & migration for memory + pages enlisted as being part of memory balloon devices avoids the + scenario aforementioned and helps improving memory defragmentation. + +# +# support for memory compaction +config COMPACTION + bool "Allow for memory compaction" + def_bool y + select MIGRATION + depends on MMU + help + Compaction is the only memory management component to form + high order (larger physically contiguous) memory blocks + reliably. The page allocator relies on compaction heavily and + the lack of the feature can lead to unexpected OOM killer + invocations for high order memory requests. You shouldn't + disable this option unless there really is a strong reason for + it and then we would be really interested to hear about that at + linux-mm@kvack.org. 
+ +config COMPACT_UNEVICTABLE_DEFAULT + int + depends on COMPACTION + default 0 if PREEMPT_RT + default 1 + +# +# support for free page reporting +config PAGE_REPORTING + bool "Free page reporting" + def_bool n + help + Free page reporting allows for the incremental acquisition of + free pages from the buddy allocator for the purpose of reporting + those pages to another entity, such as a hypervisor, so that the + memory can be freed within the host for other uses. + +# +# support for page migration +# +config MIGRATION + bool "Page migration" + def_bool y + depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU + help + Allows the migration of the physical location of pages of processes + while the virtual addresses are not changed. This is useful in + two situations. The first is on NUMA systems to put pages nearer + to the processors accessing. The second is when allocating huge + pages as migration can relocate pages to satisfy a huge page + allocation instead of reclaiming. + +config DEVICE_MIGRATION + def_bool MIGRATION && ZONE_DEVICE + +config ARCH_ENABLE_HUGEPAGE_MIGRATION + bool + +config ARCH_ENABLE_THP_MIGRATION + bool + +config HUGETLB_PAGE_SIZE_VARIABLE + def_bool n + help + Allows the pageblock_order value to be dynamic instead of just standard + HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available + on a platform. + + Note that the pageblock_order cannot exceed MAX_ORDER - 1 and will be + clamped down to MAX_ORDER - 1. + +config CONTIG_ALLOC + def_bool (MEMORY_ISOLATION && COMPACTION) || CMA + +config PHYS_ADDR_T_64BIT + def_bool 64BIT + +config BOUNCE + bool "Enable bounce buffers" + default y + depends on BLOCK && MMU && HIGHMEM + help + Enable bounce buffers for devices that cannot access the full range of + memory available to the CPU. Enabled by default when HIGHMEM is + selected, but you may say n to override this. 
+ +config MMU_NOTIFIER + bool + select SRCU + select INTERVAL_TREE + +config KSM + bool "Enable KSM for page merging" + depends on MMU + select XXHASH + help + Enable Kernel Samepage Merging: KSM periodically scans those areas + of an application's address space that an app has advised may be + mergeable. When it finds pages of identical content, it replaces + the many instances by a single page with that content, so + saving memory until one or another app needs to modify the content. + Recommended for use with KVM, or with other duplicative applications. + See Documentation/mm/ksm.rst for more information: KSM is inactive + until a program has madvised that an area is MADV_MERGEABLE, and + root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). + +config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + depends on MMU + default 4096 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages + can help reduce the impact of kernel NULL pointer bugs. + + For most ia64, ppc64 and x86 users with lots of address space + a value of 65536 is reasonable and should cause no problems. + On arm and other archs it should not be higher than 32768. + Programs which use vm86 functionality or have some need to map + this low address space will need CAP_SYS_RAWIO or disable this + protection by setting the value to 0. + + This value can be changed after boot using the + /proc/sys/vm/mmap_min_addr tunable. + +config ARCH_SUPPORTS_MEMORY_FAILURE + bool + +config MEMORY_FAILURE + depends on MMU + depends on ARCH_SUPPORTS_MEMORY_FAILURE + bool "Enable recovery from hardware memory errors" + select MEMORY_ISOLATION + select RAS + help + Enables code to recover from some memory failures on systems + with MCA recovery. This allows a system to continue running + even when some of its memory has uncorrected errors. 
This requires + special hardware support and typically ECC memory. + +config HWPOISON_INJECT + tristate "HWPoison pages injector" + depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS + select PROC_PAGE_MONITOR + +config NOMMU_INITIAL_TRIM_EXCESS + int "Turn on mmap() excess space trimming before booting" + depends on !MMU + default 1 + help + The NOMMU mmap() frequently needs to allocate large contiguous chunks + of memory on which to store mappings, but it can only ask the system + allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently + more than it requires. To deal with this, mmap() is able to trim off + the excess and return it to the allocator. + + If trimming is enabled, the excess is trimmed off and returned to the + system allocator, which can cause extra fragmentation, particularly + if there are a lot of transient processes. + + If trimming is disabled, the excess is kept, but not used, which for + long-term mappings means that the space is wasted. + + Trimming can be dynamically controlled through a sysctl option + (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of + excess pages there must be before trimming should occur, or zero if + no trimming is to occur. + + This option specifies the initial value of this option. The default + of 1 says that all excess pages should be trimmed. + + See Documentation/admin-guide/mm/nommu-mmap.rst for more information. + +config ARCH_WANT_GENERAL_HUGETLB + bool + +config ARCH_WANTS_THP_SWAP + def_bool n + +menuconfig TRANSPARENT_HUGEPAGE + bool "Transparent Hugepage Support" + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT + select COMPACTION + select XARRAY_MULTI + help + Transparent Hugepages allows the kernel to use huge pages and + huge tlb transparently to the applications whenever possible. 
+ This feature can improve computing performance to certain + applications by speeding up page faults during memory + allocation, by reducing the number of tlb misses and by speeding + up the pagetable walking. + + If memory constrained on embedded, you may want to say N. + +if TRANSPARENT_HUGEPAGE + +choice + prompt "Transparent Hugepage Support sysfs defaults" + depends on TRANSPARENT_HUGEPAGE + default TRANSPARENT_HUGEPAGE_ALWAYS + help + Selects the sysfs defaults for Transparent Hugepage Support. + + config TRANSPARENT_HUGEPAGE_ALWAYS + bool "always" + help + Enabling Transparent Hugepage always, can increase the + memory footprint of applications without a guaranteed + benefit but it will work automatically for all applications. + + config TRANSPARENT_HUGEPAGE_MADVISE + bool "madvise" + help + Enabling Transparent Hugepage madvise, will only provide a + performance improvement benefit to the applications using + madvise(MADV_HUGEPAGE) but it won't risk to increase the + memory footprint of applications without a guaranteed + benefit. +endchoice + +config THP_SWAP + def_bool y + depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP + help + Swap transparent huge pages in one piece, without splitting. + XXX: For now, swap cluster backing transparent huge page + will be split after swapout. + + For selection by architectures with reasonable THP sizes. + +config READ_ONLY_THP_FOR_FS + bool "Read-only THP for filesystems (EXPERIMENTAL)" + depends on TRANSPARENT_HUGEPAGE && SHMEM + + help + Allow khugepaged to put read-only file-backed pages in THP. + + This is marked experimental because it is a new feature. Write + support of file THPs will be developed in the next few release + cycles. 
+ +endif # TRANSPARENT_HUGEPAGE + +# +# UP and nommu archs use km based percpu allocator +# +config NEED_PER_CPU_KM + depends on !SMP || !MMU + bool + default y + +config NEED_PER_CPU_EMBED_FIRST_CHUNK + bool + +config NEED_PER_CPU_PAGE_FIRST_CHUNK + bool + +config USE_PERCPU_NUMA_NODE_ID + bool + +config HAVE_SETUP_PER_CPU_AREA + bool + +config FRONTSWAP + bool + +config CMA + bool "Contiguous Memory Allocator" + depends on MMU + select MIGRATION + select MEMORY_ISOLATION + help + This enables the Contiguous Memory Allocator which allows other + subsystems to allocate big physically-contiguous blocks of memory. + CMA reserves a region of memory and allows only movable pages to + be allocated from it. This way, the kernel can use the memory for + pagecache and when a subsystem requests for contiguous area, the + allocated pages are migrated away to serve the contiguous request. + + If unsure, say "n". + +config CMA_DEBUG + bool "CMA debug messages (DEVELOPMENT)" + depends on DEBUG_KERNEL && CMA + help + Turns on debug messages in CMA. This produces KERN_DEBUG + messages for every CMA call as well as various messages while + processing calls such as dma_alloc_from_contiguous(). + This option does not affect warning and error messages. + +config CMA_DEBUGFS + bool "CMA debugfs interface" + depends on CMA && DEBUG_FS + help + Turns on the DebugFS interface for CMA. + +config CMA_SYSFS + bool "CMA information through sysfs interface" + depends on CMA && SYSFS + help + This option exposes some sysfs attributes to get information + from CMA. + +config CMA_AREAS + int "Maximum count of the CMA areas" + depends on CMA + default 19 if NUMA + default 7 + help + CMA allows to create CMA areas for particular purpose, mainly, + used as device private area. This parameter sets the maximum + number of CMA area in the system. + + If unsure, leave the default value "7" in UMA and "19" in NUMA. 
+ +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS + select PROC_PAGE_MONITOR + help + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit is set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hand. + + See Documentation/admin-guide/mm/soft-dirty.rst for more details. + +config GENERIC_EARLY_IOREMAP + bool + +config STACK_MAX_DEFAULT_SIZE_MB + int "Default maximum user stack size for 32-bit processes (MB)" + default 100 + range 8 2048 + depends on STACK_GROWSUP && (!64BIT || COMPAT) + help + This is the maximum stack size in Megabytes in the VM layout of 32-bit + user processes when the stack grows upwards (currently only on parisc + arch) when the RLIMIT_STACK hard limit is unlimited. + + A sane initial value is 100 MB. + +config DEFERRED_STRUCT_PAGE_INIT + bool "Defer initialisation of struct pages to kthreads" + depends on SPARSEMEM + depends on !NEED_PER_CPU_KM + depends on 64BIT + select PADATA + help + Ordinarily all struct pages are initialised during early boot in a + single thread. On very large machines this can take a considerable + amount of time. If this option is set, large machines will bring up + a subset of memmap at boot and then initialise the rest in parallel. + This has a potential performance impact on tasks running early in the + lifetime of the system until these kthreads finish the + initialisation. + +config PAGE_IDLE_FLAG + bool + select PAGE_EXTENSION if !64BIT + help + This adds PG_idle and PG_young flags to 'struct page'. PTE Accessed + bit writers can set the state of the bit in the flags so that PTE + Accessed bit readers may avoid disturbance. 
+ +config IDLE_PAGE_TRACKING + bool "Enable idle page tracking" + depends on SYSFS && MMU + select PAGE_IDLE_FLAG + help + This feature allows to estimate the amount of user pages that have + not been touched during a given period of time. This information can + be useful to tune memory cgroup limits and/or for job placement + within a compute cluster. + + See Documentation/admin-guide/mm/idle_page_tracking.rst for + more details. + +config ARCH_HAS_CACHE_LINE_SIZE + bool + +config ARCH_HAS_CURRENT_STACK_POINTER + bool + help + In support of HARDENED_USERCOPY performing stack variable lifetime + checking, an architecture-agnostic way to find the stack pointer + is needed. Once an architecture defines an unsigned long global + register alias named "current_stack_pointer", this config can be + selected. + +config ARCH_HAS_PTE_DEVMAP + bool + +config ARCH_HAS_ZONE_DMA_SET + bool + +config ZONE_DMA + bool "Support DMA zone" if ARCH_HAS_ZONE_DMA_SET + default y if ARM64 || X86 + +config ZONE_DMA32 + bool "Support DMA32 zone" if ARCH_HAS_ZONE_DMA_SET + depends on !X86_32 + default y if ARM64 + +config ZONE_DEVICE + bool "Device memory (pmem, HMM, etc...) hotplug support" + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP + depends on ARCH_HAS_PTE_DEVMAP + select XARRAY_MULTI + + help + Device memory hotplug support allows for establishing pmem, + or other device driver discovered memory regions, in the + memmap. This allows pfn_to_page() lookups of otherwise + "device-physical" addresses which is needed for using a DAX + mapping in an O_DIRECT operation, among other things. + + If FS_DAX is enabled, then say Y. + +# +# Helpers to mirror range of the CPU page tables of a process into device page +# tables. 
+# +config HMM_MIRROR + bool + depends on MMU + +config GET_FREE_REGION + depends on SPARSEMEM + bool + +config DEVICE_PRIVATE + bool "Unaddressable device memory (GPU memory, ...)" + depends on ZONE_DEVICE + select GET_FREE_REGION + + help + Allows creation of struct pages to represent unaddressable device + memory; i.e., memory that is only accessible from the device (or + group of devices). You likely also want to select HMM_MIRROR. + +config VMAP_PFN + bool + +config ARCH_USES_HIGH_VMA_FLAGS + bool +config ARCH_HAS_PKEYS + bool + +config VM_EVENT_COUNTERS + default y + bool "Enable VM event counters for /proc/vmstat" if EXPERT + help + VM event counters are needed for event counts to be shown. + This option allows the disabling of the VM event counters + on EXPERT systems. /proc/vmstat will only show page counts + if VM event counters are disabled. + +config PERCPU_STATS + bool "Collect percpu memory statistics" + help + This feature collects and exposes statistics via debugfs. The + information includes global and per chunk statistics, which can + be used to help understand percpu memory usage. + +config GUP_TEST + bool "Enable infrastructure for get_user_pages()-related unit tests" + depends on DEBUG_FS + help + Provides /sys/kernel/debug/gup_test, which in turn provides a way + to make ioctl calls that can launch kernel-based unit tests for + the get_user_pages*() and pin_user_pages*() family of API calls. + + These tests include benchmark testing of the _fast variants of + get_user_pages*() and pin_user_pages*(), as well as smoke tests of + the non-_fast variants. + + There is also a sub-test that allows running dump_page() on any + of up to eight pages (selected by command line args) within the + range of user-space addresses. These pages are either pinned via + pin_user_pages*(), or pinned via get_user_pages*(), as specified + by other command line arguments. 
+ + See tools/testing/selftests/vm/gup_test.c + +comment "GUP_TEST needs to have DEBUG_FS enabled" + depends on !GUP_TEST && !DEBUG_FS + +config GUP_GET_PTE_LOW_HIGH + bool + +config ARCH_HAS_PTE_SPECIAL + bool + +# +# Some architectures require a special hugepage directory format that is +# required to support multiple hugepage sizes. For example a4fe3ce76 +# "powerpc/mm: Allow more flexible layouts for hugepage pagetables" +# introduced it on powerpc. This allows for a more flexible hugepage +# pagetable layouts. +# +config ARCH_HAS_HUGEPD + bool + +config MAPPING_DIRTY_HELPERS + bool + +config KMAP_LOCAL + bool + +config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY + bool + +# struct io_mapping based helper. Selected by drivers that need them +config IO_MAPPING + bool + +config SECRETMEM + def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED + +config ANON_VMA_NAME + bool "Anonymous VMA name support" + depends on PROC_FS && ADVISE_SYSCALLS && MMU + + help + Allow naming anonymous virtual memory areas. + + This feature allows assigning names to virtual memory areas. Assigned + names can be later retrieved from /proc/pid/maps and /proc/pid/smaps + and help identifying individual anonymous memory areas. + Assigning a name to anonymous virtual memory area might prevent that + area from being merged with adjacent virtual memory areas due to the + difference in their name. + +config USERFAULTFD + bool "Enable userfaultfd() system call" + depends on MMU + help + Enable the userfaultfd() system call that allows to intercept and + handle page faults in userland. + +config HAVE_ARCH_USERFAULTFD_WP + bool + help + Arch has userfaultfd write protection support + +config HAVE_ARCH_USERFAULTFD_MINOR + bool + help + Arch has userfaultfd minor fault support + +config PTE_MARKER + bool + + help + Allows to create marker PTEs for file-backed memory. 
+ +config PTE_MARKER_UFFD_WP + bool "Userfaultfd write protection support for shmem/hugetlbfs" + default y + depends on HAVE_ARCH_USERFAULTFD_WP + select PTE_MARKER + + help + Allows to create marker PTEs for userfaultfd write protection + purposes. It is required to enable userfaultfd write protection on + file-backed memory types like shmem and hugetlbfs. + +# multi-gen LRU { +config LRU_GEN + bool "Multi-Gen LRU" + depends on MMU + # make sure folio->flags has enough spare bits + depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP + help + A high performance LRU implementation to overcommit memory. See + Documentation/admin-guide/mm/multigen_lru.rst for details. + +config LRU_GEN_ENABLED + bool "Enable by default" + depends on LRU_GEN + help + This option enables the multi-gen LRU by default. + +config LRU_GEN_STATS + bool "Full stats for debugging" + depends on LRU_GEN + help + Do not enable this option unless you plan to look at historical stats + from evicted generations for debugging purpose. + + This option has a per-memcg and per-node memory overhead. +# } + +config LOCK_MM_AND_FIND_VMA + bool + depends on !STACK_GROWSUP + +source "mm/damon/Kconfig" + +endmenu diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug new file mode 100644 index 000000000..32c2df12a --- /dev/null +++ b/mm/Kconfig.debug @@ -0,0 +1,210 @@ +# SPDX-License-Identifier: GPL-2.0-only +config PAGE_EXTENSION + bool "Extend memmap on extra space for more information on page" + help + Extend memmap on extra space for more information on page. This + could be used for debugging features that need to insert extra + field for every page. This extension enables us to save memory + by not allocating this extra memory according to boottime + configuration. 
+ +config DEBUG_PAGEALLOC + bool "Debug page memory allocations" + depends on DEBUG_KERNEL + depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC + select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC + help + Unmap pages from the kernel linear mapping after free_pages(). + Depending on runtime enablement, this results in a small or large + slowdown, but helps to find certain types of memory corruption. + + Also, the state of page tracking structures is checked more often as + pages are being allocated and freed, as unexpected state changes + often happen for same reasons as memory corruption (e.g. double free, + use-after-free). The error reports for these checks can be augmented + with stack traces of last allocation and freeing of the page, when + PAGE_OWNER is also selected and enabled on boot. + + For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, + fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages(). Additionally, this option cannot + be enabled in combination with hibernation as that would result in + incorrect warnings of memory corruption after a resume because free + pages are not saved to the suspend image. + + By default this option will have a small overhead, e.g. by not + allowing the kernel mapping to be backed by large pages on some + architectures. Even bigger overhead comes when the debugging is + enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc + command line parameter. + +config DEBUG_PAGEALLOC_ENABLE_DEFAULT + bool "Enable debug page memory allocations by default?" + depends on DEBUG_PAGEALLOC + help + Enable debug page memory allocations by default? This value + can be overridden by debug_pagealloc=off|on. 
+ +config DEBUG_SLAB + bool "Debug slab memory allocations" + depends on DEBUG_KERNEL && SLAB + help + Say Y here to have the kernel do limited verification on memory + allocation as well as poisoning memory on free to catch use of freed + memory. This can make kmalloc/kfree-intensive workloads much slower. + +config SLUB_DEBUG + default y + bool "Enable SLUB debugging support" if EXPERT + depends on SLUB && SYSFS + select STACKDEPOT if STACKTRACE_SUPPORT + help + SLUB has extensive debug support features. Disabling these can + result in significant savings in code size. This also disables + SLUB sysfs support. /sys/slab will not exist and there will be + no support for cache validation etc. + +config SLUB_DEBUG_ON + bool "SLUB debugging on by default" + depends on SLUB && SLUB_DEBUG + select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT + default n + help + Boot with debugging on by default. SLUB boots by default with + the runtime debug capabilities switched off. Enabling this is + equivalent to specifying the "slub_debug" parameter on boot. + There is no support for more fine grained debug control like + possible with slub_debug=xxx. SLUB debugging may be switched + off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying + "slub_debug=-". + +config PAGE_OWNER + bool "Track page owner" + depends on DEBUG_KERNEL && STACKTRACE_SUPPORT + select DEBUG_FS + select STACKTRACE + select STACKDEPOT + select PAGE_EXTENSION + help + This keeps track of what call chain is the owner of a page, may + help to find bare alloc_page(s) leaks. Even if you include this + feature in your build, it is disabled by default. You should pass + "page_owner=on" as a boot parameter in order to enable it. Eats + a fair amount of memory if enabled. See tools/vm/page_owner_sort.c + for user-space helper. + + If unsure, say N. 
+ +config PAGE_TABLE_CHECK + bool "Check for invalid mappings in user page tables" + depends on ARCH_SUPPORTS_PAGE_TABLE_CHECK + depends on EXCLUSIVE_SYSTEM_RAM + select PAGE_EXTENSION + help + Check that anonymous page is not being mapped twice with read write + permissions. Check that anonymous and file pages are not being + erroneously shared. Since the checking is performed at the time + entries are added and removed to user page tables, leaking, corruption + and double mapping problems are detected synchronously. + + If unsure say "n". + +config PAGE_TABLE_CHECK_ENFORCED + bool "Enforce the page table checking by default" + depends on PAGE_TABLE_CHECK + help + Always enable page table checking. By default the page table checking + is disabled, and can be optionally enabled via page_table_check=on + kernel parameter. This config enforces that page table check is always + enabled. + + If unsure say "n". + +config PAGE_POISONING + bool "Poison pages after freeing" + help + Fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages. The filling of the memory helps + reduce the risk of information leaks from freed data. This does + have a potential performance impact if enabled with the + "page_poison=1" kernel boot option. + + Note that "poison" here is not the same thing as the "HWPoison" + for CONFIG_MEMORY_FAILURE. This is software poisoning only. + + If you are only interested in sanitization of freed pages without + checking the poison pattern on alloc, you can boot the kernel with + "init_on_free=1" instead of enabling this. + + If unsure, say N + +config DEBUG_PAGE_REF + bool "Enable tracepoint to track down page reference manipulation" + depends on DEBUG_KERNEL + depends on TRACEPOINTS + help + This is a feature to add tracepoint for tracking down page reference + manipulation. This tracking is useful to diagnose functional failure + due to migration failures caused by page reference mismatches. 
Be + careful when enabling this feature because it adds about 30 KB to the + kernel code. However the runtime performance overhead is virtually + nil until the tracepoints are actually enabled. + +config DEBUG_RODATA_TEST + bool "Testcase for the marking rodata read-only" + depends on STRICT_KERNEL_RWX + help + This option enables a testcase for the setting rodata read-only. + +config ARCH_HAS_DEBUG_WX + bool + +config DEBUG_WX + bool "Warn on W+X mappings at boot" + depends on ARCH_HAS_DEBUG_WX + depends on MMU + select PTDUMP_CORE + help + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving W+X + mappings after applying NX, as such mappings are a security risk. + + Look for a message in dmesg output like this: + + /mm: Checked W+X mappings: passed, no W+X pages found. + + or like this, if the check failed: + + /mm: Checked W+X mappings: failed, W+X pages found. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + +config GENERIC_PTDUMP + bool + +config PTDUMP_CORE + bool + +config PTDUMP_DEBUGFS + bool "Export kernel pagetable layout to userspace via debugfs" + depends on DEBUG_KERNEL + depends on DEBUG_FS + depends on GENERIC_PTDUMP + select PTDUMP_CORE + help + Say Y here if you want to show the kernel pagetable layout in a + debugfs file. This information is only useful for kernel developers + who are working in architecture specific areas of the kernel. + It is probably not a good idea to enable this feature in a production + kernel. + + If in doubt, say N. 
diff --git a/mm/Makefile b/mm/Makefile new file mode 100644 index 000000000..8e105e5b3 --- /dev/null +++ b/mm/Makefile @@ -0,0 +1,140 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux memory manager. +# + +KASAN_SANITIZE_slab_common.o := n +KASAN_SANITIZE_slab.o := n +KASAN_SANITIZE_slub.o := n +KCSAN_SANITIZE_kmemleak.o := n + +# These produce frequent data race reports: most of them are due to races on +# the same word but accesses to different bits of that word. Re-enable KCSAN +# for these when we have more consensus on what to do about them. +KCSAN_SANITIZE_slab_common.o := n +KCSAN_SANITIZE_slab.o := n +KCSAN_SANITIZE_slub.o := n +KCSAN_SANITIZE_page_alloc.o := n +# But enable explicit instrumentation for memory barriers. +KCSAN_INSTRUMENT_BARRIERS := y + +# These files are disabled because they produce non-interesting and/or +# flaky coverage that is not a function of syscall inputs. E.g. slab is out of +# free pages, or a task is migrated between nodes. +KCOV_INSTRUMENT_slab_common.o := n +KCOV_INSTRUMENT_slob.o := n +KCOV_INSTRUMENT_slab.o := n +KCOV_INSTRUMENT_slub.o := n +KCOV_INSTRUMENT_page_alloc.o := n +KCOV_INSTRUMENT_debug-pagealloc.o := n +KCOV_INSTRUMENT_kmemleak.o := n +KCOV_INSTRUMENT_memcontrol.o := n +KCOV_INSTRUMENT_mmzone.o := n +KCOV_INSTRUMENT_vmstat.o := n +KCOV_INSTRUMENT_failslab.o := n + +CFLAGS_init-mm.o += $(call cc-disable-warning, override-init) +CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides) + +mmu-y := nommu.o +mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ + mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ + msync.o page_vma_mapped.o pagewalk.o \ + pgtable-generic.o rmap.o vmalloc.o + + +ifdef CONFIG_CROSS_MEMORY_ATTACH +mmu-$(CONFIG_MMU) += process_vm_access.o +endif + +obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ + maccess.o page-writeback.o folio-compat.o \ + readahead.o swap.o truncate.o vmscan.o shmem.o \ + util.o mmzone.o vmstat.o backing-dev.o \ + mm_init.o percpu.o 
slab_common.o \ + compaction.o \ + interval_tree.o list_lru.o workingset.o \ + debug.o gup.o mmap_lock.o $(mmu-y) + +# Give 'page_alloc' its own module-parameter namespace +page-alloc-y := page_alloc.o +page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o + +# Give 'memory_hotplug' its own module-parameter namespace +memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o + +obj-y += page-alloc.o +obj-y += init-mm.o +obj-y += memblock.o +obj-y += $(memory-hotplug-y) + +ifdef CONFIG_MMU + obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o +endif + +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o +obj-$(CONFIG_FRONTSWAP) += frontswap.o +obj-$(CONFIG_ZSWAP) += zswap.o +obj-$(CONFIG_HAS_DMA) += dmapool.o +obj-$(CONFIG_HUGETLBFS) += hugetlb.o +obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o +obj-$(CONFIG_NUMA) += mempolicy.o +obj-$(CONFIG_SPARSEMEM) += sparse.o +obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o +obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_KSM) += ksm.o +obj-$(CONFIG_PAGE_POISONING) += page_poison.o +obj-$(CONFIG_SLAB) += slab.o +obj-$(CONFIG_SLUB) += slub.o +obj-$(CONFIG_KASAN) += kasan/ +obj-$(CONFIG_KFENCE) += kfence/ +obj-$(CONFIG_KMSAN) += kmsan/ +obj-$(CONFIG_FAILSLAB) += failslab.o +obj-$(CONFIG_MEMTEST) += memtest.o +obj-$(CONFIG_MIGRATION) += migrate.o +obj-$(CONFIG_NUMA) += memory-tiers.o +obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o +obj-$(CONFIG_PAGE_COUNTER) += page_counter.o +obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o +ifdef CONFIG_SWAP +obj-$(CONFIG_MEMCG) += swap_cgroup.o +endif +obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o +obj-$(CONFIG_GUP_TEST) += gup_test.o +obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o +obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o +obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o +obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o 
+obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o +obj-$(CONFIG_PAGE_OWNER) += page_owner.o +obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o +obj-$(CONFIG_ZPOOL) += zpool.o +obj-$(CONFIG_ZBUD) += zbud.o +obj-$(CONFIG_ZSMALLOC) += zsmalloc.o +obj-$(CONFIG_Z3FOLD) += z3fold.o +obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o +obj-$(CONFIG_CMA) += cma.o +obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o +obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o +obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o +obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_SECRETMEM) += secretmem.o +obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o +obj-$(CONFIG_USERFAULTFD) += userfaultfd.o +obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o +obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o +obj-$(CONFIG_DAMON) += damon/ +obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o +obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o +obj-$(CONFIG_ZONE_DEVICE) += memremap.o +obj-$(CONFIG_HMM_MIRROR) += hmm.o +obj-$(CONFIG_MEMFD_CREATE) += memfd.o +obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o +obj-$(CONFIG_PTDUMP_CORE) += ptdump.o +obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o +obj-$(CONFIG_IO_MAPPING) += io-mapping.o +obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o +obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o +obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c new file mode 100644 index 000000000..bf5525c2e --- /dev/null +++ b/mm/backing-dev.c @@ -0,0 +1,1003 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct backing_dev_info noop_backing_dev_info; +EXPORT_SYMBOL_GPL(noop_backing_dev_info); + +static struct class *bdi_class; +static const char *bdi_unknown_name = "(unknown)"; + +/* + * bdi_lock protects bdi_tree and updates to bdi_list. 
bdi_list has RCU + * reader side locking. + */ +DEFINE_SPINLOCK(bdi_lock); +static u64 bdi_id_cursor; +static struct rb_root bdi_tree = RB_ROOT; +LIST_HEAD(bdi_list); + +/* bdi_wq serves all asynchronous writeback tasks */ +struct workqueue_struct *bdi_wq; + +#define K(x) ((x) << (PAGE_SHIFT - 10)) + +#ifdef CONFIG_DEBUG_FS +#include +#include + +static struct dentry *bdi_debug_root; + +static void bdi_debug_init(void) +{ + bdi_debug_root = debugfs_create_dir("bdi", NULL); +} + +static int bdi_debug_stats_show(struct seq_file *m, void *v) +{ + struct backing_dev_info *bdi = m->private; + struct bdi_writeback *wb = &bdi->wb; + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long wb_thresh; + unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; + struct inode *inode; + + nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; + spin_lock(&wb->list_lock); + list_for_each_entry(inode, &wb->b_dirty, i_io_list) + nr_dirty++; + list_for_each_entry(inode, &wb->b_io, i_io_list) + nr_io++; + list_for_each_entry(inode, &wb->b_more_io, i_io_list) + nr_more_io++; + list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) + if (inode->i_state & I_DIRTY_TIME) + nr_dirty_time++; + spin_unlock(&wb->list_lock); + + global_dirty_limits(&background_thresh, &dirty_thresh); + wb_thresh = wb_calc_thresh(wb, dirty_thresh); + + seq_printf(m, + "BdiWriteback: %10lu kB\n" + "BdiReclaimable: %10lu kB\n" + "BdiDirtyThresh: %10lu kB\n" + "DirtyThresh: %10lu kB\n" + "BackgroundThresh: %10lu kB\n" + "BdiDirtied: %10lu kB\n" + "BdiWritten: %10lu kB\n" + "BdiWriteBandwidth: %10lu kBps\n" + "b_dirty: %10lu\n" + "b_io: %10lu\n" + "b_more_io: %10lu\n" + "b_dirty_time: %10lu\n" + "bdi_list: %10u\n" + "state: %10lx\n", + (unsigned long) K(wb_stat(wb, WB_WRITEBACK)), + (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)), + K(wb_thresh), + K(dirty_thresh), + K(background_thresh), + (unsigned long) K(wb_stat(wb, WB_DIRTIED)), + (unsigned long) K(wb_stat(wb, WB_WRITTEN)), + (unsigned 
long) K(wb->write_bandwidth), + nr_dirty, + nr_io, + nr_more_io, + nr_dirty_time, + !list_empty(&bdi->bdi_list), bdi->wb.state); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); + +static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) +{ + bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); + + debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, + &bdi_debug_stats_fops); +} + +static void bdi_debug_unregister(struct backing_dev_info *bdi) +{ + debugfs_remove_recursive(bdi->debug_dir); +} +#else +static inline void bdi_debug_init(void) +{ +} +static inline void bdi_debug_register(struct backing_dev_info *bdi, + const char *name) +{ +} +static inline void bdi_debug_unregister(struct backing_dev_info *bdi) +{ +} +#endif + +static ssize_t read_ahead_kb_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned long read_ahead_kb; + ssize_t ret; + + ret = kstrtoul(buf, 10, &read_ahead_kb); + if (ret < 0) + return ret; + + bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); + + return count; +} + +#define BDI_SHOW(name, expr) \ +static ssize_t name##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct backing_dev_info *bdi = dev_get_drvdata(dev); \ + \ + return sysfs_emit(buf, "%lld\n", (long long)expr); \ +} \ +static DEVICE_ATTR_RW(name); + +BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) + +static ssize_t min_ratio_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int ratio; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_min_ratio(bdi, ratio); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(min_ratio, bdi->min_ratio) + +static ssize_t max_ratio_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t 
count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int ratio; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_max_ratio(bdi, ratio); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(max_ratio, bdi->max_ratio) + +static ssize_t stable_pages_required_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + dev_warn_once(dev, + "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n"); + return sysfs_emit(buf, "%d\n", 0); +} +static DEVICE_ATTR_RO(stable_pages_required); + +static struct attribute *bdi_dev_attrs[] = { + &dev_attr_read_ahead_kb.attr, + &dev_attr_min_ratio.attr, + &dev_attr_max_ratio.attr, + &dev_attr_stable_pages_required.attr, + NULL, +}; +ATTRIBUTE_GROUPS(bdi_dev); + +static __init int bdi_class_init(void) +{ + bdi_class = class_create(THIS_MODULE, "bdi"); + if (IS_ERR(bdi_class)) + return PTR_ERR(bdi_class); + + bdi_class->dev_groups = bdi_dev_groups; + bdi_debug_init(); + + return 0; +} +postcore_initcall(bdi_class_init); + +static int __init default_bdi_init(void) +{ + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | + WQ_SYSFS, 0); + if (!bdi_wq) + return -ENOMEM; + return 0; +} +subsys_initcall(default_bdi_init); + +/* + * This function is used when the first inode for this wb is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. Since the write-out would + * starts only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later. + * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save few context switches + * by delaying the wake-up. + * + * We have to be careful not to postpone flush work if it is scheduled for + * earlier. 
Thus we use queue_delayed_work(). + */ +void wb_wakeup_delayed(struct bdi_writeback *wb) +{ + unsigned long timeout; + + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + spin_lock_irq(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->dwork, timeout); + spin_unlock_irq(&wb->work_lock); +} + +static void wb_update_bandwidth_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, bw_dwork); + + wb_update_bandwidth(wb); +} + +/* + * Initial write bandwidth: 100 MB/s + */ +#define INIT_BW (100 << (20 - PAGE_SHIFT)) + +static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, + gfp_t gfp) +{ + int i, err; + + memset(wb, 0, sizeof(*wb)); + + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); + INIT_LIST_HEAD(&wb->b_dirty_time); + spin_lock_init(&wb->list_lock); + + atomic_set(&wb->writeback_inodes, 0); + wb->bw_time_stamp = jiffies; + wb->balanced_dirty_ratelimit = INIT_BW; + wb->dirty_ratelimit = INIT_BW; + wb->write_bandwidth = INIT_BW; + wb->avg_write_bandwidth = INIT_BW; + + spin_lock_init(&wb->work_lock); + INIT_LIST_HEAD(&wb->work_list); + INIT_DELAYED_WORK(&wb->dwork, wb_workfn); + INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn); + wb->dirty_sleep = jiffies; + + err = fprop_local_init_percpu(&wb->completions, gfp); + if (err) + return err; + + for (i = 0; i < NR_WB_STAT_ITEMS; i++) { + err = percpu_counter_init(&wb->stat[i], 0, gfp); + if (err) + goto out_destroy_stat; + } + + return 0; + +out_destroy_stat: + while (i--) + percpu_counter_destroy(&wb->stat[i]); + fprop_local_destroy_percpu(&wb->completions); + return err; +} + +static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); + +/* + * Remove bdi from the global list and shutdown any threads we have running + */ +static void wb_shutdown(struct bdi_writeback 
*wb) +{ + /* Make sure nobody queues further work */ + spin_lock_irq(&wb->work_lock); + if (!test_and_clear_bit(WB_registered, &wb->state)) { + spin_unlock_irq(&wb->work_lock); + return; + } + spin_unlock_irq(&wb->work_lock); + + cgwb_remove_from_bdi_list(wb); + /* + * Drain work list and shutdown the delayed_work. !WB_registered + * tells wb_workfn() that @wb is dying and its work_list needs to + * be drained no matter what. + */ + mod_delayed_work(bdi_wq, &wb->dwork, 0); + flush_delayed_work(&wb->dwork); + WARN_ON(!list_empty(&wb->work_list)); + flush_delayed_work(&wb->bw_dwork); +} + +static void wb_exit(struct bdi_writeback *wb) +{ + int i; + + WARN_ON(delayed_work_pending(&wb->dwork)); + + for (i = 0; i < NR_WB_STAT_ITEMS; i++) + percpu_counter_destroy(&wb->stat[i]); + + fprop_local_destroy_percpu(&wb->completions); +} + +#ifdef CONFIG_CGROUP_WRITEBACK + +#include + +/* + * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and + * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected. 
+ */ +static DEFINE_SPINLOCK(cgwb_lock); +static struct workqueue_struct *cgwb_release_wq; + +static LIST_HEAD(offline_cgwbs); +static void cleanup_offline_cgwbs_workfn(struct work_struct *work); +static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn); + +static void cgwb_free_rcu(struct rcu_head *rcu_head) +{ + struct bdi_writeback *wb = container_of(rcu_head, + struct bdi_writeback, rcu); + + percpu_ref_exit(&wb->refcnt); + kfree(wb); +} + +static void cgwb_release_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb = container_of(work, struct bdi_writeback, + release_work); + struct backing_dev_info *bdi = wb->bdi; + + mutex_lock(&wb->bdi->cgwb_release_mutex); + wb_shutdown(wb); + + css_put(wb->memcg_css); + css_put(wb->blkcg_css); + mutex_unlock(&wb->bdi->cgwb_release_mutex); + + /* triggers blkg destruction if no online users left */ + blkcg_unpin_online(wb->blkcg_css); + + fprop_local_destroy_percpu(&wb->memcg_completions); + + spin_lock_irq(&cgwb_lock); + list_del(&wb->offline_node); + spin_unlock_irq(&cgwb_lock); + + wb_exit(wb); + bdi_put(bdi); + WARN_ON_ONCE(!list_empty(&wb->b_attached)); + call_rcu(&wb->rcu, cgwb_free_rcu); +} + +static void cgwb_release(struct percpu_ref *refcnt) +{ + struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, + refcnt); + queue_work(cgwb_release_wq, &wb->release_work); +} + +static void cgwb_kill(struct bdi_writeback *wb) +{ + lockdep_assert_held(&cgwb_lock); + + WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); + list_del(&wb->memcg_node); + list_del(&wb->blkcg_node); + list_add(&wb->offline_node, &offline_cgwbs); + percpu_ref_kill(&wb->refcnt); +} + +static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) +{ + spin_lock_irq(&cgwb_lock); + list_del_rcu(&wb->bdi_node); + spin_unlock_irq(&cgwb_lock); +} + +static int cgwb_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, gfp_t gfp) +{ + struct mem_cgroup *memcg; + struct 
cgroup_subsys_state *blkcg_css; + struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; + struct bdi_writeback *wb; + unsigned long flags; + int ret = 0; + + memcg = mem_cgroup_from_css(memcg_css); + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); + memcg_cgwb_list = &memcg->cgwb_list; + blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css); + + /* look up again under lock and discard on blkcg mismatch */ + spin_lock_irqsave(&cgwb_lock, flags); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb && wb->blkcg_css != blkcg_css) { + cgwb_kill(wb); + wb = NULL; + } + spin_unlock_irqrestore(&cgwb_lock, flags); + if (wb) + goto out_put; + + /* need to create a new one */ + wb = kmalloc(sizeof(*wb), gfp); + if (!wb) { + ret = -ENOMEM; + goto out_put; + } + + ret = wb_init(wb, bdi, gfp); + if (ret) + goto err_free; + + ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); + if (ret) + goto err_wb_exit; + + ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); + if (ret) + goto err_ref_exit; + + wb->memcg_css = memcg_css; + wb->blkcg_css = blkcg_css; + INIT_LIST_HEAD(&wb->b_attached); + INIT_WORK(&wb->release_work, cgwb_release_workfn); + set_bit(WB_registered, &wb->state); + bdi_get(bdi); + + /* + * The root wb determines the registered state of the whole bdi and + * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate + * whether they're still online. Don't link @wb if any is dead. + * See wb_memcg_offline() and wb_blkcg_offline(). 
+ */ + ret = -ENODEV; + spin_lock_irqsave(&cgwb_lock, flags); + if (test_bit(WB_registered, &bdi->wb.state) && + blkcg_cgwb_list->next && memcg_cgwb_list->next) { + /* we might have raced another instance of this function */ + ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); + if (!ret) { + list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); + list_add(&wb->memcg_node, memcg_cgwb_list); + list_add(&wb->blkcg_node, blkcg_cgwb_list); + blkcg_pin_online(blkcg_css); + css_get(memcg_css); + css_get(blkcg_css); + } + } + spin_unlock_irqrestore(&cgwb_lock, flags); + if (ret) { + if (ret == -EEXIST) + ret = 0; + goto err_fprop_exit; + } + goto out_put; + +err_fprop_exit: + bdi_put(bdi); + fprop_local_destroy_percpu(&wb->memcg_completions); +err_ref_exit: + percpu_ref_exit(&wb->refcnt); +err_wb_exit: + wb_exit(wb); +err_free: + kfree(wb); +out_put: + css_put(blkcg_css); + return ret; +} + +/** + * wb_get_lookup - get wb for a given memcg + * @bdi: target bdi + * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) + * + * Try to get the wb for @memcg_css on @bdi. The returned wb has its + * refcount incremented. + * + * This function uses css_get() on @memcg_css and thus expects its refcnt + * to be positive on invocation. IOW, rcu_read_lock() protection on + * @memcg_css isn't enough. try_get it before calling this function. + * + * A wb is keyed by its associated memcg. As blkcg implicitly enables + * memcg on the default hierarchy, memcg association is guaranteed to be + * more specific (equal or descendant to the associated blkcg) and thus can + * identify both the memcg and blkcg associations. + * + * Because the blkcg associated with a memcg may change as blkcg is enabled + * and disabled closer to root in the hierarchy, each wb keeps track of + * both the memcg and blkcg associated with it and verifies the blkcg on + * each lookup. On mismatch, the existing wb is discarded and a new one is + * created. 
+ */ +struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css) +{ + struct bdi_writeback *wb; + + if (!memcg_css->parent) + return &bdi->wb; + + rcu_read_lock(); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb) { + struct cgroup_subsys_state *blkcg_css; + + /* see whether the blkcg association has changed */ + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); + if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) + wb = NULL; + css_put(blkcg_css); + } + rcu_read_unlock(); + + return wb; +} + +/** + * wb_get_create - get wb for a given memcg, create if necessary + * @bdi: target bdi + * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) + * @gfp: allocation mask to use + * + * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to + * create one. See wb_get_lookup() for more details. + */ +struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, + gfp_t gfp) +{ + struct bdi_writeback *wb; + + might_alloc(gfp); + + if (!memcg_css->parent) + return &bdi->wb; + + do { + wb = wb_get_lookup(bdi, memcg_css); + } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); + + return wb; +} + +static int cgwb_bdi_init(struct backing_dev_info *bdi) +{ + int ret; + + INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); + mutex_init(&bdi->cgwb_release_mutex); + init_rwsem(&bdi->wb_switch_rwsem); + + ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); + if (!ret) { + bdi->wb.memcg_css = &root_mem_cgroup->css; + bdi->wb.blkcg_css = blkcg_root_css; + } + return ret; +} + +static void cgwb_bdi_unregister(struct backing_dev_info *bdi) +{ + struct radix_tree_iter iter; + void **slot; + struct bdi_writeback *wb; + + WARN_ON(test_bit(WB_registered, &bdi->wb.state)); + + spin_lock_irq(&cgwb_lock); + radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) + cgwb_kill(*slot); + spin_unlock_irq(&cgwb_lock); + + 
mutex_lock(&bdi->cgwb_release_mutex); + spin_lock_irq(&cgwb_lock); + while (!list_empty(&bdi->wb_list)) { + wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, + bdi_node); + spin_unlock_irq(&cgwb_lock); + wb_shutdown(wb); + spin_lock_irq(&cgwb_lock); + } + spin_unlock_irq(&cgwb_lock); + mutex_unlock(&bdi->cgwb_release_mutex); +} + +/* + * cleanup_offline_cgwbs_workfn - try to release dying cgwbs + * + * Try to release dying cgwbs by switching attached inodes to the nearest + * living ancestor's writeback. Processed wbs are placed at the end + * of the list to guarantee the forward progress. + */ +static void cleanup_offline_cgwbs_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb; + LIST_HEAD(processed); + + spin_lock_irq(&cgwb_lock); + + while (!list_empty(&offline_cgwbs)) { + wb = list_first_entry(&offline_cgwbs, struct bdi_writeback, + offline_node); + list_move(&wb->offline_node, &processed); + + /* + * If wb is dirty, cleaning up the writeback by switching + * attached inodes will result in an effective removal of any + * bandwidth restrictions, which isn't the goal. Instead, + * it can be postponed until the next time, when all io + * will be likely completed. If in the meantime some inodes + * will get re-dirtied, they should be eventually switched to + * a new cgwb. + */ + if (wb_has_dirty_io(wb)) + continue; + + if (!wb_tryget(wb)) + continue; + + spin_unlock_irq(&cgwb_lock); + while (cleanup_offline_cgwb(wb)) + cond_resched(); + spin_lock_irq(&cgwb_lock); + + wb_put(wb); + } + + if (!list_empty(&processed)) + list_splice_tail(&processed, &offline_cgwbs); + + spin_unlock_irq(&cgwb_lock); +} + +/** + * wb_memcg_offline - kill all wb's associated with a memcg being offlined + * @memcg: memcg being offlined + * + * Also prevents creation of any new wb's associated with @memcg. 
+ */ +void wb_memcg_offline(struct mem_cgroup *memcg) +{ + struct list_head *memcg_cgwb_list = &memcg->cgwb_list; + struct bdi_writeback *wb, *next; + + spin_lock_irq(&cgwb_lock); + list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) + cgwb_kill(wb); + memcg_cgwb_list->next = NULL; /* prevent new wb's */ + spin_unlock_irq(&cgwb_lock); + + queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work); +} + +/** + * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined + * @css: blkcg being offlined + * + * Also prevents creation of any new wb's associated with @blkcg. + */ +void wb_blkcg_offline(struct cgroup_subsys_state *css) +{ + struct bdi_writeback *wb, *next; + struct list_head *list = blkcg_get_cgwb_list(css); + + spin_lock_irq(&cgwb_lock); + list_for_each_entry_safe(wb, next, list, blkcg_node) + cgwb_kill(wb); + list->next = NULL; /* prevent new wb's */ + spin_unlock_irq(&cgwb_lock); +} + +static void cgwb_bdi_register(struct backing_dev_info *bdi) +{ + spin_lock_irq(&cgwb_lock); + list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); + spin_unlock_irq(&cgwb_lock); +} + +static int __init cgwb_init(void) +{ + /* + * There can be many concurrent release work items overwhelming + * system_wq. Put them in a separate wq and limit concurrency. + * There's no point in executing many of these in parallel. 
+ */ + cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); + if (!cgwb_release_wq) + return -ENOMEM; + + return 0; +} +subsys_initcall(cgwb_init); + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static int cgwb_bdi_init(struct backing_dev_info *bdi) +{ + return wb_init(&bdi->wb, bdi, GFP_KERNEL); +} + +static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } + +static void cgwb_bdi_register(struct backing_dev_info *bdi) +{ + list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); +} + +static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) +{ + list_del_rcu(&wb->bdi_node); +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +int bdi_init(struct backing_dev_info *bdi) +{ + bdi->dev = NULL; + + kref_init(&bdi->refcnt); + bdi->min_ratio = 0; + bdi->max_ratio = 100; + bdi->max_prop_frac = FPROP_FRAC_BASE; + INIT_LIST_HEAD(&bdi->bdi_list); + INIT_LIST_HEAD(&bdi->wb_list); + init_waitqueue_head(&bdi->wb_waitq); + + return cgwb_bdi_init(bdi); +} + +struct backing_dev_info *bdi_alloc(int node_id) +{ + struct backing_dev_info *bdi; + + bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); + if (!bdi) + return NULL; + + if (bdi_init(bdi)) { + kfree(bdi); + return NULL; + } + bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; + bdi->ra_pages = VM_READAHEAD_PAGES; + bdi->io_pages = VM_READAHEAD_PAGES; + timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); + return bdi; +} +EXPORT_SYMBOL(bdi_alloc); + +static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) +{ + struct rb_node **p = &bdi_tree.rb_node; + struct rb_node *parent = NULL; + struct backing_dev_info *bdi; + + lockdep_assert_held(&bdi_lock); + + while (*p) { + parent = *p; + bdi = rb_entry(parent, struct backing_dev_info, rb_node); + + if (bdi->id > id) + p = &(*p)->rb_left; + else if (bdi->id < id) + p = &(*p)->rb_right; + else + break; + } + + if (parentp) + *parentp = parent; + return p; +} + +/** + * bdi_get_by_id - lookup and get bdi from its id + * @id: bdi 
id to lookup + * + * Find bdi matching @id and get it. Returns NULL if the matching bdi + * doesn't exist or is already unregistered. + */ +struct backing_dev_info *bdi_get_by_id(u64 id) +{ + struct backing_dev_info *bdi = NULL; + struct rb_node **p; + + spin_lock_bh(&bdi_lock); + p = bdi_lookup_rb_node(id, NULL); + if (*p) { + bdi = rb_entry(*p, struct backing_dev_info, rb_node); + bdi_get(bdi); + } + spin_unlock_bh(&bdi_lock); + + return bdi; +} + +int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) +{ + struct device *dev; + struct rb_node *parent, **p; + + if (bdi->dev) /* The driver needs to use separate queues per device */ + return 0; + + vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); + dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + cgwb_bdi_register(bdi); + bdi->dev = dev; + + bdi_debug_register(bdi, dev_name(dev)); + set_bit(WB_registered, &bdi->wb.state); + + spin_lock_bh(&bdi_lock); + + bdi->id = ++bdi_id_cursor; + + p = bdi_lookup_rb_node(bdi->id, &parent); + rb_link_node(&bdi->rb_node, parent, p); + rb_insert_color(&bdi->rb_node, &bdi_tree); + + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + + spin_unlock_bh(&bdi_lock); + + trace_writeback_bdi_register(bdi); + return 0; +} + +int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) 
+{ + va_list args; + int ret; + + va_start(args, fmt); + ret = bdi_register_va(bdi, fmt, args); + va_end(args); + return ret; +} +EXPORT_SYMBOL(bdi_register); + +void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) +{ + WARN_ON_ONCE(bdi->owner); + bdi->owner = owner; + get_device(owner); +} + +/* + * Remove bdi from bdi_list, and ensure that it is no longer visible + */ +static void bdi_remove_from_list(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi_lock); + rb_erase(&bdi->rb_node, &bdi_tree); + list_del_rcu(&bdi->bdi_list); + spin_unlock_bh(&bdi_lock); + + synchronize_rcu_expedited(); +} + +void bdi_unregister(struct backing_dev_info *bdi) +{ + del_timer_sync(&bdi->laptop_mode_wb_timer); + + /* make sure nobody finds us on the bdi_list anymore */ + bdi_remove_from_list(bdi); + wb_shutdown(&bdi->wb); + cgwb_bdi_unregister(bdi); + + /* + * If this BDI's min ratio has been set, use bdi_set_min_ratio() to + * update the global bdi_min_ratio. + */ + if (bdi->min_ratio) + bdi_set_min_ratio(bdi, 0); + + if (bdi->dev) { + bdi_debug_unregister(bdi); + device_unregister(bdi->dev); + bdi->dev = NULL; + } + + if (bdi->owner) { + put_device(bdi->owner); + bdi->owner = NULL; + } +} +EXPORT_SYMBOL(bdi_unregister); + +static void release_bdi(struct kref *ref) +{ + struct backing_dev_info *bdi = + container_of(ref, struct backing_dev_info, refcnt); + + WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); + WARN_ON_ONCE(bdi->dev); + wb_exit(&bdi->wb); + kfree(bdi); +} + +void bdi_put(struct backing_dev_info *bdi) +{ + kref_put(&bdi->refcnt, release_bdi); +} +EXPORT_SYMBOL(bdi_put); + +struct backing_dev_info *inode_to_bdi(struct inode *inode) +{ + struct super_block *sb; + + if (!inode) + return &noop_backing_dev_info; + + sb = inode->i_sb; +#ifdef CONFIG_BLOCK + if (sb_is_blkdev_sb(sb)) + return I_BDEV(inode)->bd_disk->bdi; +#endif + return sb->s_bdi; +} +EXPORT_SYMBOL(inode_to_bdi); + +const char *bdi_dev_name(struct backing_dev_info *bdi) +{ + if (!bdi 
|| !bdi->dev) + return bdi_unknown_name; + return bdi->dev_name; +} +EXPORT_SYMBOL_GPL(bdi_dev_name); diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c new file mode 100644 index 000000000..22c96fed7 --- /dev/null +++ b/mm/balloon_compaction.c @@ -0,0 +1,258 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm/balloon_compaction.c + * + * Common interface for making balloon pages movable by compaction. + * + * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini + */ +#include +#include +#include +#include + +static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info, + struct page *page) +{ + /* + * Block others from accessing the 'page' when we get around to + * establishing additional references. We should be the only one + * holding a reference to the 'page' at this point. If we are not, then + * memory corruption is possible and we should stop execution. + */ + BUG_ON(!trylock_page(page)); + balloon_page_insert(b_dev_info, page); + unlock_page(page); + __count_vm_event(BALLOON_INFLATE); +} + +/** + * balloon_page_list_enqueue() - inserts a list of pages into the balloon page + * list. + * @b_dev_info: balloon device descriptor where we will insert a new page to + * @pages: pages to enqueue - allocated using balloon_page_alloc. + * + * Driver must call this function to properly enqueue balloon pages before + * definitively removing them from the guest system. + * + * Return: number of pages that were enqueued. 
+ */ +size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info, + struct list_head *pages) +{ + struct page *page, *tmp; + unsigned long flags; + size_t n_pages = 0; + + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + list_for_each_entry_safe(page, tmp, pages, lru) { + list_del(&page->lru); + balloon_page_enqueue_one(b_dev_info, page); + n_pages++; + } + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + return n_pages; +} +EXPORT_SYMBOL_GPL(balloon_page_list_enqueue); + +/** + * balloon_page_list_dequeue() - removes pages from balloon's page list and + * returns a list of the pages. + * @b_dev_info: balloon device descriptor where we will grab a page from. + * @pages: pointer to the list of pages that would be returned to the caller. + * @n_req_pages: number of requested pages. + * + * Driver must call this function to properly de-allocate a previous enlisted + * balloon pages before definitively releasing it back to the guest system. + * This function tries to remove @n_req_pages from the ballooned pages and + * return them to the caller in the @pages list. + * + * Note that this function may fail to dequeue some pages even if the balloon + * isn't empty - since the page list can be temporarily empty due to compaction + * of isolated pages. + * + * Return: number of pages that were added to the @pages list. + */ +size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info, + struct list_head *pages, size_t n_req_pages) +{ + struct page *page, *tmp; + unsigned long flags; + size_t n_pages = 0; + + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { + if (n_pages == n_req_pages) + break; + + /* + * Block others from accessing the 'page' while we get around to + * establishing additional references and preparing the 'page' + * to be released by the balloon driver. 
+ */ + if (!trylock_page(page)) + continue; + + if (IS_ENABLED(CONFIG_BALLOON_COMPACTION) && + PageIsolated(page)) { + /* raced with isolation */ + unlock_page(page); + continue; + } + balloon_page_delete(page); + __count_vm_event(BALLOON_DEFLATE); + list_add(&page->lru, pages); + unlock_page(page); + n_pages++; + } + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + + return n_pages; +} +EXPORT_SYMBOL_GPL(balloon_page_list_dequeue); + +/* + * balloon_page_alloc - allocates a new page for insertion into the balloon + * page list. + * + * Driver must call this function to properly allocate a new balloon page. + * Driver must call balloon_page_enqueue before definitively removing the page + * from the guest system. + * + * Return: struct page for the allocated page or NULL on allocation failure. + */ +struct page *balloon_page_alloc(void) +{ + struct page *page = alloc_page(balloon_mapping_gfp_mask() | + __GFP_NOMEMALLOC | __GFP_NORETRY | + __GFP_NOWARN); + return page; +} +EXPORT_SYMBOL_GPL(balloon_page_alloc); + +/* + * balloon_page_enqueue - inserts a new page into the balloon page list. + * + * @b_dev_info: balloon device descriptor where we will insert a new page + * @page: new page to enqueue - allocated using balloon_page_alloc. + * + * Drivers must call this function to properly enqueue a new allocated balloon + * page before definitively removing the page from the guest system. + * + * Drivers must not call balloon_page_enqueue on pages that have been pushed to + * a list with balloon_page_push before removing them with balloon_page_pop. To + * enqueue a list of pages, use balloon_page_list_enqueue instead. 
+ */ +void balloon_page_enqueue(struct balloon_dev_info *b_dev_info, + struct page *page) +{ + unsigned long flags; + + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + balloon_page_enqueue_one(b_dev_info, page); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); +} +EXPORT_SYMBOL_GPL(balloon_page_enqueue); + +/* + * balloon_page_dequeue - removes a page from balloon's page list and returns + * its address to allow the driver to release the page. + * @b_dev_info: balloon device descriptor where we will grab a page from. + * + * Driver must call this function to properly dequeue a previously enqueued page + * before definitively releasing it back to the guest system. + * + * Caller must perform its own accounting to ensure that this + * function is called only if some pages are actually enqueued. + * + * Note that this function may fail to dequeue some pages even if there are + * some enqueued pages - since the page list can be temporarily empty due to + * the compaction of isolated pages. + * + * TODO: remove the caller accounting requirements, and allow caller to wait + * until all pages can be dequeued. + * + * Return: struct page for the dequeued page, or NULL if no page was dequeued. + */ +struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) +{ + unsigned long flags; + LIST_HEAD(pages); + int n_pages; + + n_pages = balloon_page_list_dequeue(b_dev_info, &pages, 1); + + if (n_pages != 1) { + /* + * If we are unable to dequeue a balloon page because the page + * list is empty and there are no isolated pages, then something + * went out of track and some balloon pages are lost. + * BUG() here, otherwise the balloon driver may get stuck in + * an infinite loop while attempting to release all its pages. 
+ */ + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + if (unlikely(list_empty(&b_dev_info->pages) && + !b_dev_info->isolated_pages)) + BUG(); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + return NULL; + } + return list_first_entry(&pages, struct page, lru); +} +EXPORT_SYMBOL_GPL(balloon_page_dequeue); + +#ifdef CONFIG_BALLOON_COMPACTION + +static bool balloon_page_isolate(struct page *page, isolate_mode_t mode) + +{ + struct balloon_dev_info *b_dev_info = balloon_page_device(page); + unsigned long flags; + + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + list_del(&page->lru); + b_dev_info->isolated_pages++; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + + return true; +} + +static void balloon_page_putback(struct page *page) +{ + struct balloon_dev_info *b_dev_info = balloon_page_device(page); + unsigned long flags; + + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + list_add(&page->lru, &b_dev_info->pages); + b_dev_info->isolated_pages--; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); +} + +/* move_to_new_page() counterpart for a ballooned page */ +static int balloon_page_migrate(struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + struct balloon_dev_info *balloon = balloon_page_device(page); + + /* + * We can not easily support the no copy case here so ignore it as it + * is unlikely to be used with balloon pages. See include/linux/hmm.h + * for a user of the MIGRATE_SYNC_NO_COPY mode. 
+ */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + + return balloon->migratepage(balloon, newpage, page, mode); +} + +const struct movable_operations balloon_mops = { + .migrate_page = balloon_page_migrate, + .isolate_page = balloon_page_isolate, + .putback_page = balloon_page_putback, +}; +EXPORT_SYMBOL_GPL(balloon_mops); + +#endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c new file mode 100644 index 000000000..b1efebfcf --- /dev/null +++ b/mm/bootmem_info.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Bootmem core functions. + * + * Copyright (c) 2020, Bytedance. + * + * Author: Muchun Song + * + */ +#include +#include +#include +#include +#include +#include + +void get_page_bootmem(unsigned long info, struct page *page, unsigned long type) +{ + page->index = type; + SetPagePrivate(page); + set_page_private(page, info); + page_ref_inc(page); +} + +void put_page_bootmem(struct page *page) +{ + unsigned long type = page->index; + + BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || + type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); + + if (page_ref_dec_return(page) == 1) { + page->index = 0; + ClearPagePrivate(page); + set_page_private(page, 0); + INIT_LIST_HEAD(&page->lru); + kmemleak_free_part(page_to_virt(page), PAGE_SIZE); + free_reserved_page(page); + } +} + +#ifndef CONFIG_SPARSEMEM_VMEMMAP +static void __init register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + struct mem_section_usage *usage; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + /* Get section's memmap address */ + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + /* + * Get page for the memmap's phys address + * XXX: need more consideration for sparse_vmemmap... 
+ */ + page = virt_to_page(memmap); + mapsize = sizeof(struct page) * PAGES_PER_SECTION; + mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT; + + /* remember memmap's page */ + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, SECTION_INFO); + + usage = ms->usage; + page = virt_to_page(usage); + + mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); + +} +#else /* CONFIG_SPARSEMEM_VMEMMAP */ +static void __init register_page_bootmem_info_section(unsigned long start_pfn) +{ + unsigned long mapsize, section_nr, i; + struct mem_section *ms; + struct page *page, *memmap; + struct mem_section_usage *usage; + + section_nr = pfn_to_section_nr(start_pfn); + ms = __nr_to_section(section_nr); + + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + + register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); + + usage = ms->usage; + page = virt_to_page(usage); + + mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; + + for (i = 0; i < mapsize; i++, page++) + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ + +void __init register_page_bootmem_info_node(struct pglist_data *pgdat) +{ + unsigned long i, pfn, end_pfn, nr_pages; + int node = pgdat->node_id; + struct page *page; + + nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT; + page = virt_to_page(pgdat); + + for (i = 0; i < nr_pages; i++, page++) + get_page_bootmem(node, page, NODE_INFO); + + pfn = pgdat->node_start_pfn; + end_pfn = pgdat_end_pfn(pgdat); + + /* register section info */ + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + /* + * Some platforms can assign the same pfn to multiple nodes - on + * node0 as well as nodeN. To avoid registering a pfn against + * multiple nodes we check that this pfn does not already + * reside in some other nodes. 
+ */ + if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node)) + register_page_bootmem_info_section(pfn); + } +} diff --git a/mm/cma.c b/mm/cma.c new file mode 100644 index 000000000..30b6ca300 --- /dev/null +++ b/mm/cma.c @@ -0,0 +1,588 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Contiguous Memory Allocator + * + * Copyright (c) 2010-2011 by Samsung Electronics. + * Copyright IBM Corporation, 2013 + * Copyright LG Electronics Inc., 2014 + * Written by: + * Marek Szyprowski + * Michal Nazarewicz + * Aneesh Kumar K.V + * Joonsoo Kim + */ + +#define pr_fmt(fmt) "cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +# define DEBUG +#endif +#endif +#define CREATE_TRACE_POINTS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cma.h" + +struct cma cma_areas[MAX_CMA_AREAS]; +unsigned cma_area_count; +static DEFINE_MUTEX(cma_mutex); + +phys_addr_t cma_get_base(const struct cma *cma) +{ + return PFN_PHYS(cma->base_pfn); +} + +unsigned long cma_get_size(const struct cma *cma) +{ + return cma->count << PAGE_SHIFT; +} + +const char *cma_get_name(const struct cma *cma) +{ + return cma->name; +} + +static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, + unsigned int align_order) +{ + if (align_order <= cma->order_per_bit) + return 0; + return (1UL << (align_order - cma->order_per_bit)) - 1; +} + +/* + * Find the offset of the base PFN from the specified align_order. + * The value returned is represented in order_per_bits. 
+ */ +static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, + unsigned int align_order) +{ + return (cma->base_pfn & ((1UL << align_order) - 1)) + >> cma->order_per_bit; +} + +static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, + unsigned long pages) +{ + return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; +} + +static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, + unsigned long count) +{ + unsigned long bitmap_no, bitmap_count; + unsigned long flags; + + bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + spin_lock_irqsave(&cma->lock, flags); + bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); + spin_unlock_irqrestore(&cma->lock, flags); +} + +static void __init cma_activate_area(struct cma *cma) +{ + unsigned long base_pfn = cma->base_pfn, pfn; + struct zone *zone; + + cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL); + if (!cma->bitmap) + goto out_error; + + /* + * alloc_contig_range() requires the pfn range specified to be in the + * same zone. Simplify by forcing the entire CMA resv range to be in the + * same zone. + */ + WARN_ON_ONCE(!pfn_valid(base_pfn)); + zone = page_zone(pfn_to_page(base_pfn)); + for (pfn = base_pfn + 1; pfn < base_pfn + cma->count; pfn++) { + WARN_ON_ONCE(!pfn_valid(pfn)); + if (page_zone(pfn_to_page(pfn)) != zone) + goto not_in_zone; + } + + for (pfn = base_pfn; pfn < base_pfn + cma->count; + pfn += pageblock_nr_pages) + init_cma_reserved_pageblock(pfn_to_page(pfn)); + + spin_lock_init(&cma->lock); + +#ifdef CONFIG_CMA_DEBUGFS + INIT_HLIST_HEAD(&cma->mem_head); + spin_lock_init(&cma->mem_head_lock); +#endif + + return; + +not_in_zone: + bitmap_free(cma->bitmap); +out_error: + /* Expose all pages to the buddy, they are useless for CMA. 
*/ + if (!cma->reserve_pages_on_error) { + for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++) + free_reserved_page(pfn_to_page(pfn)); + } + totalcma_pages -= cma->count; + cma->count = 0; + pr_err("CMA area %s could not be activated\n", cma->name); + return; +} + +static int __init cma_init_reserved_areas(void) +{ + int i; + + for (i = 0; i < cma_area_count; i++) + cma_activate_area(&cma_areas[i]); + + return 0; +} +core_initcall(cma_init_reserved_areas); + +void __init cma_reserve_pages_on_error(struct cma *cma) +{ + cma->reserve_pages_on_error = true; +} + +/** + * cma_init_reserved_mem() - create custom contiguous area from reserved memory + * @base: Base address of the reserved area + * @size: Size of the reserved area (in bytes), + * @order_per_bit: Order of pages represented by one bit on bitmap. + * @name: The name of the area. If this parameter is NULL, the name of + * the area will be set to "cmaN", where N is a running counter of + * used areas. + * @res_cma: Pointer to store the created cma region. + * + * This function creates custom contiguous area from already reserved memory. + */ +int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, + unsigned int order_per_bit, + const char *name, + struct cma **res_cma) +{ + struct cma *cma; + + /* Sanity checks */ + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size || !memblock_is_region_reserved(base, size)) + return -EINVAL; + + /* alignment should be aligned with order_per_bit */ + if (!IS_ALIGNED(CMA_MIN_ALIGNMENT_PAGES, 1 << order_per_bit)) + return -EINVAL; + + /* ensure minimal alignment required by mm core */ + if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES)) + return -EINVAL; + + /* + * Each reserved area must be initialised later, when more kernel + * subsystems (like slab allocator) are available. 
+ */ + cma = &cma_areas[cma_area_count]; + + if (name) + snprintf(cma->name, CMA_MAX_NAME, name); + else + snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count); + + cma->base_pfn = PFN_DOWN(base); + cma->count = size >> PAGE_SHIFT; + cma->order_per_bit = order_per_bit; + *res_cma = cma; + cma_area_count++; + totalcma_pages += (size / PAGE_SIZE); + + return 0; +} + +/** + * cma_declare_contiguous_nid() - reserve custom contiguous area + * @base: Base address of the reserved area optional, use 0 for any + * @size: Size of the reserved area (in bytes), + * @limit: End address of the reserved memory (optional, 0 for any). + * @alignment: Alignment for the CMA area, should be power of 2 or zero + * @order_per_bit: Order of pages represented by one bit on bitmap. + * @fixed: hint about where to place the reserved area + * @name: The name of the area. See function cma_init_reserved_mem() + * @res_cma: Pointer to store the created cma region. + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. This function allows to create custom reserved areas. + * + * If @fixed is true, reserve contiguous area at exactly @base. If false, + * reserve in range from @base to @limit. + */ +int __init cma_declare_contiguous_nid(phys_addr_t base, + phys_addr_t size, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + bool fixed, const char *name, struct cma **res_cma, + int nid) +{ + phys_addr_t memblock_end = memblock_end_of_DRAM(); + phys_addr_t highmem_start; + int ret = 0; + + /* + * We can't use __pa(high_memory) directly, since high_memory + * isn't a valid direct map VA, and DEBUG_VIRTUAL will (validly) + * complain. Find the boundary by adding one to the last valid + * address. 
+ */ + highmem_start = __pa(high_memory - 1) + 1; + pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", + __func__, &size, &base, &limit, &alignment); + + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size) + return -EINVAL; + + if (alignment && !is_power_of_2(alignment)) + return -EINVAL; + + /* Sanitise input arguments. */ + alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); + if (fixed && base & (alignment - 1)) { + ret = -EINVAL; + pr_err("Region at %pa must be aligned to %pa bytes\n", + &base, &alignment); + goto err; + } + base = ALIGN(base, alignment); + size = ALIGN(size, alignment); + limit &= ~(alignment - 1); + + if (!base) + fixed = false; + + /* size should be aligned with order_per_bit */ + if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) + return -EINVAL; + + /* + * If allocating at a fixed base the request region must not cross the + * low/high memory boundary. + */ + if (fixed && base < highmem_start && base + size > highmem_start) { + ret = -EINVAL; + pr_err("Region at %pa defined on low/high memory boundary (%pa)\n", + &base, &highmem_start); + goto err; + } + + /* + * If the limit is unspecified or above the memblock end, its effective + * value will be the memblock end. Set it explicitly to simplify further + * checks. + */ + if (limit == 0 || limit > memblock_end) + limit = memblock_end; + + if (base + size > limit) { + ret = -EINVAL; + pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n", + &size, &base, &limit); + goto err; + } + + /* Reserve memory */ + if (fixed) { + if (memblock_is_region_reserved(base, size) || + memblock_reserve(base, size) < 0) { + ret = -EBUSY; + goto err; + } + } else { + phys_addr_t addr = 0; + + /* + * All pages in the reserved area must come from the same zone. 
+ * If the requested region crosses the low/high memory boundary, + * try allocating from high memory first and fall back to low + * memory in case of failure. + */ + if (base < highmem_start && limit > highmem_start) { + addr = memblock_alloc_range_nid(size, alignment, + highmem_start, limit, nid, true); + limit = highmem_start; + } + + /* + * If there is enough memory, try a bottom-up allocation first. + * It will place the new cma area close to the start of the node + * and guarantee that the compaction is moving pages out of the + * cma area and not into it. + * Avoid using first 4GB to not interfere with constrained zones + * like DMA/DMA32. + */ +#ifdef CONFIG_PHYS_ADDR_T_64BIT + if (!memblock_bottom_up() && memblock_end >= SZ_4G + size) { + memblock_set_bottom_up(true); + addr = memblock_alloc_range_nid(size, alignment, SZ_4G, + limit, nid, true); + memblock_set_bottom_up(false); + } +#endif + + if (!addr) { + addr = memblock_alloc_range_nid(size, alignment, base, + limit, nid, true); + if (!addr) { + ret = -ENOMEM; + goto err; + } + } + + /* + * kmemleak scans/reads tracked objects for pointers to other + * objects but this address isn't mapped and accessible + */ + kmemleak_ignore_phys(addr); + base = addr; + } + + ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); + if (ret) + goto free_mem; + + pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, + &base); + return 0; + +free_mem: + memblock_phys_free(base, size); +err: + pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + return ret; +} + +#ifdef CONFIG_CMA_DEBUG +static void cma_debug_show_areas(struct cma *cma) +{ + unsigned long next_zero_bit, next_set_bit, nr_zero; + unsigned long start = 0; + unsigned long nr_part, nr_total = 0; + unsigned long nbits = cma_bitmap_maxno(cma); + + spin_lock_irq(&cma->lock); + pr_info("number of available pages: "); + for (;;) { + next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start); + if (next_zero_bit >= 
nbits) + break; + next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit); + nr_zero = next_set_bit - next_zero_bit; + nr_part = nr_zero << cma->order_per_bit; + pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part, + next_zero_bit); + nr_total += nr_part; + start = next_zero_bit + nr_zero; + } + pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); + spin_unlock_irq(&cma->lock); +} +#else +static inline void cma_debug_show_areas(struct cma *cma) { } +#endif + +/** + * cma_alloc() - allocate pages from contiguous area + * @cma: Contiguous memory region for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). + * @no_warn: Avoid printing message about failed allocation + * + * This function allocates part of contiguous memory on specific + * contiguous memory area. + */ +struct page *cma_alloc(struct cma *cma, unsigned long count, + unsigned int align, bool no_warn) +{ + unsigned long mask, offset; + unsigned long pfn = -1; + unsigned long start = 0; + unsigned long bitmap_maxno, bitmap_no, bitmap_count; + unsigned long i; + struct page *page = NULL; + int ret = -ENOMEM; + + if (!cma || !cma->count || !cma->bitmap) + goto out; + + pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma, + count, align); + + if (!count) + goto out; + + trace_cma_alloc_start(cma->name, count, align); + + mask = cma_bitmap_aligned_mask(cma, align); + offset = cma_bitmap_aligned_offset(cma, align); + bitmap_maxno = cma_bitmap_maxno(cma); + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + if (bitmap_count > bitmap_maxno) + goto out; + + for (;;) { + spin_lock_irq(&cma->lock); + bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, + bitmap_maxno, start, bitmap_count, mask, + offset); + if (bitmap_no >= bitmap_maxno) { + spin_unlock_irq(&cma->lock); + break; + } + bitmap_set(cma->bitmap, bitmap_no, bitmap_count); + /* + * It's safe to drop the lock here. 
We've marked this region for + * our exclusive use. If the migration fails we will take the + * lock again and unmark it. + */ + spin_unlock_irq(&cma->lock); + + pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); + mutex_lock(&cma_mutex); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, + GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); + mutex_unlock(&cma_mutex); + if (ret == 0) { + page = pfn_to_page(pfn); + break; + } + + cma_clear_bitmap(cma, pfn, count); + if (ret != -EBUSY) + break; + + pr_debug("%s(): memory range at %p is busy, retrying\n", + __func__, pfn_to_page(pfn)); + + trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), + count, align); + /* try again with a bit different memory target */ + start = bitmap_no + mask + 1; + } + + trace_cma_alloc_finish(cma->name, pfn, page, count, align); + + /* + * CMA can allocate multiple page blocks, which results in different + * blocks being marked with different tags. Reset the tags to ignore + * those page blocks. + */ + if (page) { + for (i = 0; i < count; i++) + page_kasan_tag_reset(nth_page(page, i)); + } + + if (ret && !no_warn) { + pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n", + __func__, cma->name, count, ret); + cma_debug_show_areas(cma); + } + + pr_debug("%s(): returned %p\n", __func__, page); +out: + if (page) { + count_vm_event(CMA_ALLOC_SUCCESS); + cma_sysfs_account_success_pages(cma, count); + } else { + count_vm_event(CMA_ALLOC_FAIL); + if (cma) + cma_sysfs_account_fail_pages(cma, count); + } + + return page; +} + +bool cma_pages_valid(struct cma *cma, const struct page *pages, + unsigned long count) +{ + unsigned long pfn; + + if (!cma || !pages) + return false; + + pfn = page_to_pfn(pages); + + if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) { + pr_debug("%s(page %p, count %lu)\n", __func__, + (void *)pages, count); + return false; + } + + return true; +} + +/** + * cma_release() - release allocated pages + * @cma: Contiguous memory 
region for which the allocation is performed. + * @pages: Allocated pages. + * @count: Number of allocated pages. + * + * This function releases memory allocated by cma_alloc(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. + */ +bool cma_release(struct cma *cma, const struct page *pages, + unsigned long count) +{ + unsigned long pfn; + + if (!cma_pages_valid(cma, pages, count)) + return false; + + pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); + + pfn = page_to_pfn(pages); + + VM_BUG_ON(pfn + count > cma->base_pfn + cma->count); + + free_contig_range(pfn, count); + cma_clear_bitmap(cma, pfn, count); + trace_cma_release(cma->name, pfn, pages, count); + + return true; +} + +int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) +{ + int i; + + for (i = 0; i < cma_area_count; i++) { + int ret = it(&cma_areas[i], data); + + if (ret) + return ret; + } + + return 0; +} diff --git a/mm/cma.h b/mm/cma.h new file mode 100644 index 000000000..88a059567 --- /dev/null +++ b/mm/cma.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __MM_CMA_H__ +#define __MM_CMA_H__ + +#include +#include + +struct cma_kobject { + struct kobject kobj; + struct cma *cma; +}; + +struct cma { + unsigned long base_pfn; + unsigned long count; + unsigned long *bitmap; + unsigned int order_per_bit; /* Order of pages represented by one bit */ + spinlock_t lock; +#ifdef CONFIG_CMA_DEBUGFS + struct hlist_head mem_head; + spinlock_t mem_head_lock; + struct debugfs_u32_array dfs_bitmap; +#endif + char name[CMA_MAX_NAME]; +#ifdef CONFIG_CMA_SYSFS + /* the number of CMA page successful allocations */ + atomic64_t nr_pages_succeeded; + /* the number of CMA page allocation failures */ + atomic64_t nr_pages_failed; + /* kobject requires dynamic object */ + struct cma_kobject *cma_kobj; +#endif + bool reserve_pages_on_error; +}; + +extern struct cma cma_areas[MAX_CMA_AREAS]; +extern unsigned 
cma_area_count; + +static inline unsigned long cma_bitmap_maxno(struct cma *cma) +{ + return cma->count >> cma->order_per_bit; +} + +#ifdef CONFIG_CMA_SYSFS +void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages); +void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages); +#else +static inline void cma_sysfs_account_success_pages(struct cma *cma, + unsigned long nr_pages) {}; +static inline void cma_sysfs_account_fail_pages(struct cma *cma, + unsigned long nr_pages) {}; +#endif +#endif diff --git a/mm/cma_debug.c b/mm/cma_debug.c new file mode 100644 index 000000000..602fff89b --- /dev/null +++ b/mm/cma_debug.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CMA DebugFS Interface + * + * Copyright (c) 2015 Sasha Levin + */ + + +#include +#include +#include +#include +#include +#include + +#include "cma.h" + +struct cma_mem { + struct hlist_node node; + struct page *p; + unsigned long n; +}; + +static int cma_debugfs_get(void *data, u64 *val) +{ + unsigned long *p = data; + + *val = *p; + + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); + +static int cma_used_get(void *data, u64 *val) +{ + struct cma *cma = data; + unsigned long used; + + spin_lock_irq(&cma->lock); + /* pages counter is smaller than sizeof(int) */ + used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma)); + spin_unlock_irq(&cma->lock); + *val = (u64)used << cma->order_per_bit; + + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n"); + +static int cma_maxchunk_get(void *data, u64 *val) +{ + struct cma *cma = data; + unsigned long maxchunk = 0; + unsigned long start, end = 0; + unsigned long bitmap_maxno = cma_bitmap_maxno(cma); + + spin_lock_irq(&cma->lock); + for (;;) { + start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); + if (start >= bitmap_maxno) + break; + end = find_next_bit(cma->bitmap, bitmap_maxno, start); + maxchunk = max(end - start, maxchunk); 
+ } + spin_unlock_irq(&cma->lock); + *val = (u64)maxchunk << cma->order_per_bit; + + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n"); + +static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) +{ + spin_lock(&cma->mem_head_lock); + hlist_add_head(&mem->node, &cma->mem_head); + spin_unlock(&cma->mem_head_lock); +} + +static struct cma_mem *cma_get_entry_from_list(struct cma *cma) +{ + struct cma_mem *mem = NULL; + + spin_lock(&cma->mem_head_lock); + if (!hlist_empty(&cma->mem_head)) { + mem = hlist_entry(cma->mem_head.first, struct cma_mem, node); + hlist_del_init(&mem->node); + } + spin_unlock(&cma->mem_head_lock); + + return mem; +} + +static int cma_free_mem(struct cma *cma, int count) +{ + struct cma_mem *mem = NULL; + + while (count) { + mem = cma_get_entry_from_list(cma); + if (mem == NULL) + return 0; + + if (mem->n <= count) { + cma_release(cma, mem->p, mem->n); + count -= mem->n; + kfree(mem); + } else if (cma->order_per_bit == 0) { + cma_release(cma, mem->p, count); + mem->p += count; + mem->n -= count; + count = 0; + cma_add_to_cma_mem_list(cma, mem); + } else { + pr_debug("cma: cannot release partial block when order_per_bit != 0\n"); + cma_add_to_cma_mem_list(cma, mem); + break; + } + } + + return 0; + +} + +static int cma_free_write(void *data, u64 val) +{ + int pages = val; + struct cma *cma = data; + + return cma_free_mem(cma, pages); +} +DEFINE_DEBUGFS_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); + +static int cma_alloc_mem(struct cma *cma, int count) +{ + struct cma_mem *mem; + struct page *p; + + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) + return -ENOMEM; + + p = cma_alloc(cma, count, 0, false); + if (!p) { + kfree(mem); + return -ENOMEM; + } + + mem->p = p; + mem->n = count; + + cma_add_to_cma_mem_list(cma, mem); + + return 0; +} + +static int cma_alloc_write(void *data, u64 val) +{ + int pages = val; + struct cma *cma = data; + + return cma_alloc_mem(cma, 
pages); +} +DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); + +static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) +{ + struct dentry *tmp; + + tmp = debugfs_create_dir(cma->name, root_dentry); + + debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); + debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); + debugfs_create_file("base_pfn", 0444, tmp, + &cma->base_pfn, &cma_debugfs_fops); + debugfs_create_file("count", 0444, tmp, &cma->count, &cma_debugfs_fops); + debugfs_create_file("order_per_bit", 0444, tmp, + &cma->order_per_bit, &cma_debugfs_fops); + debugfs_create_file("used", 0444, tmp, cma, &cma_used_fops); + debugfs_create_file("maxchunk", 0444, tmp, cma, &cma_maxchunk_fops); + + cma->dfs_bitmap.array = (u32 *)cma->bitmap; + cma->dfs_bitmap.n_elements = DIV_ROUND_UP(cma_bitmap_maxno(cma), + BITS_PER_BYTE * sizeof(u32)); + debugfs_create_u32_array("bitmap", 0444, tmp, &cma->dfs_bitmap); +} + +static int __init cma_debugfs_init(void) +{ + struct dentry *cma_debugfs_root; + int i; + + cma_debugfs_root = debugfs_create_dir("cma", NULL); + + for (i = 0; i < cma_area_count; i++) + cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root); + + return 0; +} +late_initcall(cma_debugfs_init); diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c new file mode 100644 index 000000000..eb2f39caf --- /dev/null +++ b/mm/cma_sysfs.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CMA SysFS Interface + * + * Copyright (c) 2021 Minchan Kim + */ + +#include +#include +#include + +#include "cma.h" + +#define CMA_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_succeeded); +} + +void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_failed); +} + +static inline struct cma *cma_from_kobj(struct 
kobject *kobj) +{ + return container_of(kobj, struct cma_kobject, kobj)->cma; +} + +static ssize_t alloc_pages_success_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", + atomic64_read(&cma->nr_pages_succeeded)); +} +CMA_ATTR_RO(alloc_pages_success); + +static ssize_t alloc_pages_fail_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_failed)); +} +CMA_ATTR_RO(alloc_pages_fail); + +static void cma_kobj_release(struct kobject *kobj) +{ + struct cma *cma = cma_from_kobj(kobj); + struct cma_kobject *cma_kobj = cma->cma_kobj; + + kfree(cma_kobj); + cma->cma_kobj = NULL; +} + +static struct attribute *cma_attrs[] = { + &alloc_pages_success_attr.attr, + &alloc_pages_fail_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(cma); + +static struct kobj_type cma_ktype = { + .release = cma_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = cma_groups, +}; + +static int __init cma_sysfs_init(void) +{ + struct kobject *cma_kobj_root; + struct cma_kobject *cma_kobj; + struct cma *cma; + int i, err; + + cma_kobj_root = kobject_create_and_add("cma", mm_kobj); + if (!cma_kobj_root) + return -ENOMEM; + + for (i = 0; i < cma_area_count; i++) { + cma_kobj = kzalloc(sizeof(*cma_kobj), GFP_KERNEL); + if (!cma_kobj) { + err = -ENOMEM; + goto out; + } + + cma = &cma_areas[i]; + cma->cma_kobj = cma_kobj; + cma_kobj->cma = cma; + err = kobject_init_and_add(&cma_kobj->kobj, &cma_ktype, + cma_kobj_root, "%s", cma->name); + if (err) { + kobject_put(&cma_kobj->kobj); + goto out; + } + } + + return 0; +out: + while (--i >= 0) { + cma = &cma_areas[i]; + kobject_put(&cma->cma_kobj->kobj); + } + kobject_put(cma_kobj_root); + + return err; +} +subsys_initcall(cma_sysfs_init); diff --git a/mm/compaction.c b/mm/compaction.c new file mode 100644 index 000000000..8238e8338 
--- /dev/null +++ b/mm/compaction.c @@ -0,0 +1,3068 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/mm/compaction.c + * + * Memory compaction for the reduction of external fragmentation. Note that + * this heavily depends upon page migration to do all the real heavy + * lifting + * + * Copyright IBM Corp. 2007-2010 Mel Gorman + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#ifdef CONFIG_COMPACTION +/* + * Fragmentation score check interval for proactive compaction purposes. + */ +#define HPAGE_FRAG_CHECK_INTERVAL_MSEC (500) + +static inline void count_compact_event(enum vm_event_item item) +{ + count_vm_event(item); +} + +static inline void count_compact_events(enum vm_event_item item, long delta) +{ + count_vm_events(item, delta); +} +#else +#define count_compact_event(item) do { } while (0) +#define count_compact_events(item, delta) do { } while (0) +#endif + +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + +#define CREATE_TRACE_POINTS +#include + +#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order)) +#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order)) + +/* + * Page order with-respect-to which proactive compaction + * calculates external fragmentation, which is used as + * the "fragmentation score" of a node/zone. 
+ */ +#if defined CONFIG_TRANSPARENT_HUGEPAGE +#define COMPACTION_HPAGE_ORDER HPAGE_PMD_ORDER +#elif defined CONFIG_HUGETLBFS +#define COMPACTION_HPAGE_ORDER HUGETLB_PAGE_ORDER +#else +#define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT) +#endif + +static unsigned long release_freepages(struct list_head *freelist) +{ + struct page *page, *next; + unsigned long high_pfn = 0; + + list_for_each_entry_safe(page, next, freelist, lru) { + unsigned long pfn = page_to_pfn(page); + list_del(&page->lru); + __free_page(page); + if (pfn > high_pfn) + high_pfn = pfn; + } + + return high_pfn; +} + +static void split_map_pages(struct list_head *list) +{ + unsigned int i, order, nr_pages; + struct page *page, *next; + LIST_HEAD(tmp_list); + + list_for_each_entry_safe(page, next, list, lru) { + list_del(&page->lru); + + order = page_private(page); + nr_pages = 1 << order; + + post_alloc_hook(page, order, __GFP_MOVABLE); + if (order) + split_page(page, order); + + for (i = 0; i < nr_pages; i++) { + list_add(&page->lru, &tmp_list); + page++; + } + } + + list_splice(&tmp_list, list); +} + +#ifdef CONFIG_COMPACTION +bool PageMovable(struct page *page) +{ + const struct movable_operations *mops; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + if (!__PageMovable(page)) + return false; + + mops = page_movable_ops(page); + if (mops) + return true; + + return false; +} +EXPORT_SYMBOL(PageMovable); + +void __SetPageMovable(struct page *page, const struct movable_operations *mops) +{ + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page); + page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE); +} +EXPORT_SYMBOL(__SetPageMovable); + +void __ClearPageMovable(struct page *page) +{ + VM_BUG_ON_PAGE(!PageMovable(page), page); + /* + * This page still has the type of a movable page, but it's + * actually not movable any more. 
+ */ + page->mapping = (void *)PAGE_MAPPING_MOVABLE; +} +EXPORT_SYMBOL(__ClearPageMovable); + +/* Do not skip compaction more than 64 times */ +#define COMPACT_MAX_DEFER_SHIFT 6 + +/* + * Compaction is deferred when compaction fails to result in a page + * allocation success. 1 << compact_defer_shift, compactions are skipped up + * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT + */ +static void defer_compaction(struct zone *zone, int order) +{ + zone->compact_considered = 0; + zone->compact_defer_shift++; + + if (order < zone->compact_order_failed) + zone->compact_order_failed = order; + + if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) + zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; + + trace_mm_compaction_defer_compaction(zone, order); +} + +/* Returns true if compaction should be skipped this time */ +static bool compaction_deferred(struct zone *zone, int order) +{ + unsigned long defer_limit = 1UL << zone->compact_defer_shift; + + if (order < zone->compact_order_failed) + return false; + + /* Avoid possible overflow */ + if (++zone->compact_considered >= defer_limit) { + zone->compact_considered = defer_limit; + return false; + } + + trace_mm_compaction_deferred(zone, order); + + return true; +} + +/* + * Update defer tracking counters after successful compaction of given order, + * which means an allocation either succeeded (alloc_success == true) or is + * expected to succeed. 
+ */ +void compaction_defer_reset(struct zone *zone, int order, + bool alloc_success) +{ + if (alloc_success) { + zone->compact_considered = 0; + zone->compact_defer_shift = 0; + } + if (order >= zone->compact_order_failed) + zone->compact_order_failed = order + 1; + + trace_mm_compaction_defer_reset(zone, order); +} + +/* Returns true if restarting compaction after many failures */ +static bool compaction_restarting(struct zone *zone, int order) +{ + if (order < zone->compact_order_failed) + return false; + + return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && + zone->compact_considered >= 1UL << zone->compact_defer_shift; +} + +/* Returns true if the pageblock should be scanned for pages to isolate. */ +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + if (cc->ignore_skip_hint) + return true; + + return !get_pageblock_skip(page); +} + +static void reset_cached_positions(struct zone *zone) +{ + zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; + zone->compact_cached_free_pfn = + pageblock_start_pfn(zone_end_pfn(zone) - 1); +} + +/* + * Compound pages of >= pageblock_order should consistently be skipped until + * released. It is always pointless to compact pages of such order (if they are + * migratable), and the pageblocks they occupy cannot contain any free pages. 
+ */ +static bool pageblock_skip_persistent(struct page *page) +{ + if (!PageCompound(page)) + return false; + + page = compound_head(page); + + if (compound_order(page) >= pageblock_order) + return true; + + return false; +} + +static bool +__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, + bool check_target) +{ + struct page *page = pfn_to_online_page(pfn); + struct page *block_page; + struct page *end_page; + unsigned long block_pfn; + + if (!page) + return false; + if (zone != page_zone(page)) + return false; + if (pageblock_skip_persistent(page)) + return false; + + /* + * If skip is already cleared do no further checking once the + * restart points have been set. + */ + if (check_source && check_target && !get_pageblock_skip(page)) + return true; + + /* + * If clearing skip for the target scanner, do not select a + * non-movable pageblock as the starting point. + */ + if (!check_source && check_target && + get_pageblock_migratetype(page) != MIGRATE_MOVABLE) + return false; + + /* Ensure the start of the pageblock or zone is online and valid */ + block_pfn = pageblock_start_pfn(pfn); + block_pfn = max(block_pfn, zone->zone_start_pfn); + block_page = pfn_to_online_page(block_pfn); + if (block_page) { + page = block_page; + pfn = block_pfn; + } + + /* Ensure the end of the pageblock or zone is online and valid */ + block_pfn = pageblock_end_pfn(pfn) - 1; + block_pfn = min(block_pfn, zone_end_pfn(zone) - 1); + end_page = pfn_to_online_page(block_pfn); + if (!end_page) + return false; + + /* + * Only clear the hint if a sample indicates there is either a + * free page or an LRU page in the block. One or other condition + * is necessary for the block to be a migration source/target. 
+ */ + do { + if (check_source && PageLRU(page)) { + clear_pageblock_skip(page); + return true; + } + + if (check_target && PageBuddy(page)) { + clear_pageblock_skip(page); + return true; + } + + page += (1 << PAGE_ALLOC_COSTLY_ORDER); + } while (page <= end_page); + + return false; +} + +/* + * This function is called to clear all cached information on pageblocks that + * should be skipped for page isolation when the migrate and free page scanner + * meet. + */ +static void __reset_isolation_suitable(struct zone *zone) +{ + unsigned long migrate_pfn = zone->zone_start_pfn; + unsigned long free_pfn = zone_end_pfn(zone) - 1; + unsigned long reset_migrate = free_pfn; + unsigned long reset_free = migrate_pfn; + bool source_set = false; + bool free_set = false; + + if (!zone->compact_blockskip_flush) + return; + + zone->compact_blockskip_flush = false; + + /* + * Walk the zone and update pageblock skip information. Source looks + * for PageLRU while target looks for PageBuddy. When the scanner + * is found, both PageBuddy and PageLRU are checked as the pageblock + * is suitable as both source and target. 
+ */ + for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages, + free_pfn -= pageblock_nr_pages) { + cond_resched(); + + /* Update the migrate PFN */ + if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) && + migrate_pfn < reset_migrate) { + source_set = true; + reset_migrate = migrate_pfn; + zone->compact_init_migrate_pfn = reset_migrate; + zone->compact_cached_migrate_pfn[0] = reset_migrate; + zone->compact_cached_migrate_pfn[1] = reset_migrate; + } + + /* Update the free PFN */ + if (__reset_isolation_pfn(zone, free_pfn, free_set, true) && + free_pfn > reset_free) { + free_set = true; + reset_free = free_pfn; + zone->compact_init_free_pfn = reset_free; + zone->compact_cached_free_pfn = reset_free; + } + } + + /* Leave no distance if no suitable block was reset */ + if (reset_migrate >= reset_free) { + zone->compact_cached_migrate_pfn[0] = migrate_pfn; + zone->compact_cached_migrate_pfn[1] = migrate_pfn; + zone->compact_cached_free_pfn = free_pfn; + } +} + +void reset_isolation_suitable(pg_data_t *pgdat) +{ + int zoneid; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + /* Only flush if a full compaction finished recently */ + if (zone->compact_blockskip_flush) + __reset_isolation_suitable(zone); + } +} + +/* + * Sets the pageblock skip bit if it was clear. Note that this is a hint as + * locks are not required for read/writers. Returns true if it was already set. 
+ */ +static bool test_and_set_skip(struct compact_control *cc, struct page *page, + unsigned long pfn) +{ + bool skip; + + /* Do no update if skip hint is being ignored */ + if (cc->ignore_skip_hint) + return false; + + if (!pageblock_aligned(pfn)) + return false; + + skip = get_pageblock_skip(page); + if (!skip && !cc->no_set_skip_hint) + set_pageblock_skip(page); + + return skip; +} + +static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) +{ + struct zone *zone = cc->zone; + + pfn = pageblock_end_pfn(pfn); + + /* Set for isolation rather than compaction */ + if (cc->no_set_skip_hint) + return; + + if (pfn > zone->compact_cached_migrate_pfn[0]) + zone->compact_cached_migrate_pfn[0] = pfn; + if (cc->mode != MIGRATE_ASYNC && + pfn > zone->compact_cached_migrate_pfn[1]) + zone->compact_cached_migrate_pfn[1] = pfn; +} + +/* + * If no pages were isolated then mark this pageblock to be skipped in the + * future. The information is later cleared by __reset_isolation_suitable(). 
+ */ +static void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long pfn) +{ + struct zone *zone = cc->zone; + + if (cc->no_set_skip_hint) + return; + + if (!page) + return; + + set_pageblock_skip(page); + + /* Update where async and sync compaction should restart */ + if (pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; +} +#else +static inline bool isolation_suitable(struct compact_control *cc, + struct page *page) +{ + return true; +} + +static inline bool pageblock_skip_persistent(struct page *page) +{ + return false; +} + +static inline void update_pageblock_skip(struct compact_control *cc, + struct page *page, unsigned long pfn) +{ +} + +static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) +{ +} + +static bool test_and_set_skip(struct compact_control *cc, struct page *page, + unsigned long pfn) +{ + return false; +} +#endif /* CONFIG_COMPACTION */ + +/* + * Compaction requires the taking of some coarse locks that are potentially + * very heavily contended. For async compaction, trylock and record if the + * lock is contended. The lock will still be acquired but compaction will + * abort when the current block is finished regardless of success rate. + * Sync compaction acquires the lock. + * + * Always returns true which makes it easier to track lock state in callers. + */ +static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, + struct compact_control *cc) + __acquires(lock) +{ + /* Track if the lock is contended in async mode */ + if (cc->mode == MIGRATE_ASYNC && !cc->contended) { + if (spin_trylock_irqsave(lock, *flags)) + return true; + + cc->contended = true; + } + + spin_lock_irqsave(lock, *flags); + return true; +} + +/* + * Compaction requires the taking of some coarse locks that are potentially + * very heavily contended. 
The lock should be periodically unlocked to avoid + * having disabled IRQs for a long time, even when there is nobody waiting on + * the lock. It might also be that allowing the IRQs will result in + * need_resched() becoming true. If scheduling is needed, compaction schedules. + * Either compaction type will also abort if a fatal signal is pending. + * In either case if the lock was locked, it is dropped and not regained. + * + * Returns true if compaction should abort due to fatal signal pending. + * Returns false when compaction can continue. + */ +static bool compact_unlock_should_abort(spinlock_t *lock, + unsigned long flags, bool *locked, struct compact_control *cc) +{ + if (*locked) { + spin_unlock_irqrestore(lock, flags); + *locked = false; + } + + if (fatal_signal_pending(current)) { + cc->contended = true; + return true; + } + + cond_resched(); + + return false; +} + +/* + * Isolate free pages onto a private freelist. If @strict is true, will abort + * returning 0 on any invalid PFNs or non-free pages inside of the pageblock + * (even though it may still end up isolating some pages). + */ +static unsigned long isolate_freepages_block(struct compact_control *cc, + unsigned long *start_pfn, + unsigned long end_pfn, + struct list_head *freelist, + unsigned int stride, + bool strict) +{ + int nr_scanned = 0, total_isolated = 0; + struct page *cursor; + unsigned long flags = 0; + bool locked = false; + unsigned long blockpfn = *start_pfn; + unsigned int order; + + /* Strict mode is for isolation, speed is secondary */ + if (strict) + stride = 1; + + cursor = pfn_to_page(blockpfn); + + /* Isolate free pages. */ + for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) { + int isolated; + struct page *page = cursor; + + /* + * Periodically drop the lock (if held) regardless of its + * contention, to give chance to IRQs. Abort if fatal signal + * pending. 
+ */ + if (!(blockpfn % COMPACT_CLUSTER_MAX) + && compact_unlock_should_abort(&cc->zone->lock, flags, + &locked, cc)) + break; + + nr_scanned++; + + /* + * For compound pages such as THP and hugetlbfs, we can save + * potentially a lot of iterations if we skip them at once. + * The check is racy, but we can consider only valid values + * and the only danger is skipping too much. + */ + if (PageCompound(page)) { + const unsigned int order = compound_order(page); + + if (likely(order < MAX_ORDER)) { + blockpfn += (1UL << order) - 1; + cursor += (1UL << order) - 1; + } + goto isolate_fail; + } + + if (!PageBuddy(page)) + goto isolate_fail; + + /* If we already hold the lock, we can skip some rechecking. */ + if (!locked) { + locked = compact_lock_irqsave(&cc->zone->lock, + &flags, cc); + + /* Recheck this is a buddy page under lock */ + if (!PageBuddy(page)) + goto isolate_fail; + } + + /* Found a free page, will break it into order-0 pages */ + order = buddy_order(page); + isolated = __isolate_free_page(page, order); + if (!isolated) + break; + set_page_private(page, order); + + nr_scanned += isolated - 1; + total_isolated += isolated; + cc->nr_freepages += isolated; + list_add_tail(&page->lru, freelist); + + if (!strict && cc->nr_migratepages <= cc->nr_freepages) { + blockpfn += isolated; + break; + } + /* Advance to the end of split page */ + blockpfn += isolated - 1; + cursor += isolated - 1; + continue; + +isolate_fail: + if (strict) + break; + else + continue; + + } + + if (locked) + spin_unlock_irqrestore(&cc->zone->lock, flags); + + /* + * There is a tiny chance that we have read bogus compound_order(), + * so be careful to not go outside of the pageblock. 
+ */ + if (unlikely(blockpfn > end_pfn)) + blockpfn = end_pfn; + + trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn, + nr_scanned, total_isolated); + + /* Record how far we have got within the block */ + *start_pfn = blockpfn; + + /* + * If strict isolation is requested by CMA then check that all the + * pages requested were isolated. If there were any failures, 0 is + * returned and CMA will fail. + */ + if (strict && blockpfn < end_pfn) + total_isolated = 0; + + cc->total_free_scanned += nr_scanned; + if (total_isolated) + count_compact_events(COMPACTISOLATED, total_isolated); + return total_isolated; +} + +/** + * isolate_freepages_range() - isolate free pages. + * @cc: Compaction control structure. + * @start_pfn: The first PFN to start isolating. + * @end_pfn: The one-past-last PFN. + * + * Non-free pages, invalid PFNs, or zone boundaries within the + * [start_pfn, end_pfn) range are considered errors, cause function to + * undo its actions and return zero. + * + * Otherwise, function returns one-past-the-last PFN of isolated page + * (which may be greater then end_pfn if end fell in a middle of + * a free page). + */ +unsigned long +isolate_freepages_range(struct compact_control *cc, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long isolated, pfn, block_start_pfn, block_end_pfn; + LIST_HEAD(freelist); + + pfn = start_pfn; + block_start_pfn = pageblock_start_pfn(pfn); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; + block_end_pfn = pageblock_end_pfn(pfn); + + for (; pfn < end_pfn; pfn += isolated, + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + /* Protect pfn from changing by isolate_freepages_block */ + unsigned long isolate_start_pfn = pfn; + + block_end_pfn = min(block_end_pfn, end_pfn); + + /* + * pfn could pass the block_end_pfn if isolated freepage + * is more than pageblock order. In this case, we adjust + * scanning range to right one. 
+ */ + if (pfn >= block_end_pfn) { + block_start_pfn = pageblock_start_pfn(pfn); + block_end_pfn = pageblock_end_pfn(pfn); + block_end_pfn = min(block_end_pfn, end_pfn); + } + + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) + break; + + isolated = isolate_freepages_block(cc, &isolate_start_pfn, + block_end_pfn, &freelist, 0, true); + + /* + * In strict mode, isolate_freepages_block() returns 0 if + * there are any holes in the block (ie. invalid PFNs or + * non-free pages). + */ + if (!isolated) + break; + + /* + * If we managed to isolate pages, it is always (1 << n) * + * pageblock_nr_pages for some non-negative n. (Max order + * page may span two pageblocks). + */ + } + + /* __isolate_free_page() does not map the pages */ + split_map_pages(&freelist); + + if (pfn < end_pfn) { + /* Loop terminated early, cleanup. */ + release_freepages(&freelist); + return 0; + } + + /* We don't use freelists for anything. */ + return pfn; +} + +/* Similar to reclaim, but different enough that they don't share logic */ +static bool too_many_isolated(pg_data_t *pgdat) +{ + bool too_many; + + unsigned long active, inactive, isolated; + + inactive = node_page_state(pgdat, NR_INACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_ANON); + active = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_ACTIVE_ANON); + isolated = node_page_state(pgdat, NR_ISOLATED_FILE) + + node_page_state(pgdat, NR_ISOLATED_ANON); + + too_many = isolated > (inactive + active) / 2; + if (!too_many) + wake_throttle_isolated(pgdat); + + return too_many; +} + +/** + * isolate_migratepages_block() - isolate all migrate-able pages within + * a single pageblock + * @cc: Compaction control structure. + * @low_pfn: The first PFN to isolate + * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock + * @mode: Isolation mode to be used. + * + * Isolate all pages that can be migrated from the range specified by + * [low_pfn, end_pfn). 
The range is expected to be within same pageblock. + * Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion, + * -ENOMEM in case we could not allocate a page, or 0. + * cc->migrate_pfn will contain the next pfn to scan. + * + * The pages are isolated on cc->migratepages list (not required to be empty), + * and cc->nr_migratepages is updated accordingly. + */ +static int +isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, + unsigned long end_pfn, isolate_mode_t mode) +{ + pg_data_t *pgdat = cc->zone->zone_pgdat; + unsigned long nr_scanned = 0, nr_isolated = 0; + struct lruvec *lruvec; + unsigned long flags = 0; + struct lruvec *locked = NULL; + struct page *page = NULL, *valid_page = NULL; + struct address_space *mapping; + unsigned long start_pfn = low_pfn; + bool skip_on_failure = false; + unsigned long next_skip_pfn = 0; + bool skip_updated = false; + int ret = 0; + + cc->migrate_pfn = low_pfn; + + /* + * Ensure that there are not too many pages isolated from the LRU + * list by either parallel reclaimers or compaction. If there are, + * delay for some time until fewer pages are isolated + */ + while (unlikely(too_many_isolated(pgdat))) { + /* stop isolation if there are still pages not migrated */ + if (cc->nr_migratepages) + return -EAGAIN; + + /* async migration should just abort */ + if (cc->mode == MIGRATE_ASYNC) + return -EAGAIN; + + reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); + + if (fatal_signal_pending(current)) + return -EINTR; + } + + cond_resched(); + + if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { + skip_on_failure = true; + next_skip_pfn = block_end_pfn(low_pfn, cc->order); + } + + /* Time to isolate some pages for migration */ + for (; low_pfn < end_pfn; low_pfn++) { + + if (skip_on_failure && low_pfn >= next_skip_pfn) { + /* + * We have isolated all migration candidates in the + * previous order-aligned block, and did not skip it due + * to failure. 
We should migrate the pages now and + * hopefully succeed compaction. + */ + if (nr_isolated) + break; + + /* + * We failed to isolate in the previous order-aligned + * block. Set the new boundary to the end of the + * current block. Note we can't simply increase + * next_skip_pfn by 1 << order, as low_pfn might have + * been incremented by a higher number due to skipping + * a compound or a high-order buddy page in the + * previous loop iteration. + */ + next_skip_pfn = block_end_pfn(low_pfn, cc->order); + } + + /* + * Periodically drop the lock (if held) regardless of its + * contention, to give chance to IRQs. Abort completely if + * a fatal signal is pending. + */ + if (!(low_pfn % COMPACT_CLUSTER_MAX)) { + if (locked) { + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; + } + + if (fatal_signal_pending(current)) { + cc->contended = true; + ret = -EINTR; + + goto fatal_pending; + } + + cond_resched(); + } + + nr_scanned++; + + page = pfn_to_page(low_pfn); + + /* + * Check if the pageblock has already been marked skipped. + * Only the aligned PFN is checked as the caller isolates + * COMPACT_CLUSTER_MAX at a time so the second call must + * not falsely conclude that the block should be skipped. + */ + if (!valid_page && pageblock_aligned(low_pfn)) { + if (!isolation_suitable(cc, page)) { + low_pfn = end_pfn; + page = NULL; + goto isolate_abort; + } + valid_page = page; + } + + if (PageHuge(page) && cc->alloc_contig) { + ret = isolate_or_dissolve_huge_page(page, &cc->migratepages); + + /* + * Fail isolation in case isolate_or_dissolve_huge_page() + * reports an error. In case of -ENOMEM, abort right away. + */ + if (ret < 0) { + /* Do not report -EBUSY down the chain */ + if (ret == -EBUSY) + ret = 0; + low_pfn += compound_nr(page) - 1; + goto isolate_fail; + } + + if (PageHuge(page)) { + /* + * Hugepage was successfully isolated and placed + * on the cc->migratepages list. 
+ */ + low_pfn += compound_nr(page) - 1; + goto isolate_success_no_list; + } + + /* + * Ok, the hugepage was dissolved. Now these pages are + * Buddy and cannot be re-allocated because they are + * isolated. Fall-through as the check below handles + * Buddy pages. + */ + } + + /* + * Skip if free. We read page order here without zone lock + * which is generally unsafe, but the race window is small and + * the worst thing that can happen is that we skip some + * potential isolation targets. + */ + if (PageBuddy(page)) { + unsigned long freepage_order = buddy_order_unsafe(page); + + /* + * Without lock, we cannot be sure that what we got is + * a valid page order. Consider only values in the + * valid order range to prevent low_pfn overflow. + */ + if (freepage_order > 0 && freepage_order < MAX_ORDER) + low_pfn += (1UL << freepage_order) - 1; + continue; + } + + /* + * Regardless of being on LRU, compound pages such as THP and + * hugetlbfs are not to be compacted unless we are attempting + * an allocation much larger than the huge page size (eg CMA). + * We can potentially save a lot of iterations if we skip them + * at once. The check is racy, but we can consider only valid + * values and the only danger is skipping too much. + */ + if (PageCompound(page) && !cc->alloc_contig) { + const unsigned int order = compound_order(page); + + if (likely(order < MAX_ORDER)) + low_pfn += (1UL << order) - 1; + goto isolate_fail; + } + + /* + * Check may be lockless but that's ok as we recheck later. + * It's possible to migrate LRU and non-lru movable pages. + * Skip any other type of page + */ + if (!PageLRU(page)) { + /* + * __PageMovable can return false positive so we need + * to verify it under page_lock. 
+ */ + if (unlikely(__PageMovable(page)) && + !PageIsolated(page)) { + if (locked) { + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; + } + + if (!isolate_movable_page(page, mode)) + goto isolate_success; + } + + goto isolate_fail; + } + + /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. + */ + if (unlikely(!get_page_unless_zero(page))) + goto isolate_fail; + + /* + * Migration will fail if an anonymous page is pinned in memory, + * so avoid taking lru_lock and isolating it unnecessarily in an + * admittedly racy check. + */ + mapping = page_mapping(page); + if (!mapping && (page_count(page) - 1) > total_mapcount(page)) + goto isolate_fail_put; + + /* + * Only allow to migrate anonymous pages in GFP_NOFS context + * because those do not depend on fs locks. + */ + if (!(cc->gfp_mask & __GFP_FS) && mapping) + goto isolate_fail_put; + + /* Only take pages on LRU: a check now makes later tests safe */ + if (!PageLRU(page)) + goto isolate_fail_put; + + /* Compaction might skip unevictable pages but CMA takes them */ + if (!(mode & ISOLATE_UNEVICTABLE) && PageUnevictable(page)) + goto isolate_fail_put; + + /* + * To minimise LRU disruption, the caller can indicate with + * ISOLATE_ASYNC_MIGRATE that it only wants to isolate pages + * it will be able to migrate without blocking - clean pages + * for the most part. PageWriteback would require blocking. + */ + if ((mode & ISOLATE_ASYNC_MIGRATE) && PageWriteback(page)) + goto isolate_fail_put; + + if ((mode & ISOLATE_ASYNC_MIGRATE) && PageDirty(page)) { + bool migrate_dirty; + + /* + * Only pages without mappings or that have a + * ->migrate_folio callback are possible to migrate + * without blocking. However, we can be racing with + * truncation so it's necessary to lock the page + * to stabilise the mapping as truncation holds + * the page lock until after the page is removed + * from the page cache. 
+ */ + if (!trylock_page(page)) + goto isolate_fail_put; + + mapping = page_mapping(page); + migrate_dirty = !mapping || + mapping->a_ops->migrate_folio; + unlock_page(page); + if (!migrate_dirty) + goto isolate_fail_put; + } + + /* Try isolate the page */ + if (!TestClearPageLRU(page)) + goto isolate_fail_put; + + lruvec = folio_lruvec(page_folio(page)); + + /* If we already hold the lock, we can skip some rechecking */ + if (lruvec != locked) { + if (locked) + unlock_page_lruvec_irqrestore(locked, flags); + + compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); + locked = lruvec; + + lruvec_memcg_debug(lruvec, page_folio(page)); + + /* Try get exclusive access under lock */ + if (!skip_updated) { + skip_updated = true; + if (test_and_set_skip(cc, page, low_pfn)) + goto isolate_abort; + } + + /* + * Page become compound since the non-locked check, + * and it's on LRU. It can only be a THP so the order + * is safe to read and it's 0 for tail pages. + */ + if (unlikely(PageCompound(page) && !cc->alloc_contig)) { + low_pfn += compound_nr(page) - 1; + SetPageLRU(page); + goto isolate_fail_put; + } + } + + /* The whole page is taken off the LRU; skip the tail pages. */ + if (PageCompound(page)) + low_pfn += compound_nr(page) - 1; + + /* Successfully isolated */ + del_page_from_lru_list(page, lruvec); + mod_node_page_state(page_pgdat(page), + NR_ISOLATED_ANON + page_is_file_lru(page), + thp_nr_pages(page)); + +isolate_success: + list_add(&page->lru, &cc->migratepages); +isolate_success_no_list: + cc->nr_migratepages += compound_nr(page); + nr_isolated += compound_nr(page); + nr_scanned += compound_nr(page) - 1; + + /* + * Avoid isolating too much unless this block is being + * rescanned (e.g. dirty/writeback pages, parallel allocation) + * or a lock is contended. For contention, isolate quickly to + * potentially remove one source of contention. 
+ */ + if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX && + !cc->rescan && !cc->contended) { + ++low_pfn; + break; + } + + continue; + +isolate_fail_put: + /* Avoid potential deadlock in freeing page under lru_lock */ + if (locked) { + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; + } + put_page(page); + +isolate_fail: + if (!skip_on_failure && ret != -ENOMEM) + continue; + + /* + * We have isolated some pages, but then failed. Release them + * instead of migrating, as we cannot form the cc->order buddy + * page anyway. + */ + if (nr_isolated) { + if (locked) { + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; + } + putback_movable_pages(&cc->migratepages); + cc->nr_migratepages = 0; + nr_isolated = 0; + } + + if (low_pfn < next_skip_pfn) { + low_pfn = next_skip_pfn - 1; + /* + * The check near the loop beginning would have updated + * next_skip_pfn too, but this is a bit simpler. + */ + next_skip_pfn += 1UL << cc->order; + } + + if (ret == -ENOMEM) + break; + } + + /* + * The PageBuddy() check could have potentially brought us outside + * the range to be scanned. + */ + if (unlikely(low_pfn > end_pfn)) + low_pfn = end_pfn; + + page = NULL; + +isolate_abort: + if (locked) + unlock_page_lruvec_irqrestore(locked, flags); + if (page) { + SetPageLRU(page); + put_page(page); + } + + /* + * Updated the cached scanner pfn once the pageblock has been scanned + * Pages will either be migrated in which case there is no point + * scanning in the near future or migration failed in which case the + * failure reason may persist. The block is marked for skipping if + * there were no pages isolated in the block or if the block is + * rescanned twice in a row. 
+ */ + if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { + if (valid_page && !skip_updated) + set_pageblock_skip(valid_page); + update_cached_migrate(cc, low_pfn); + } + + trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, + nr_scanned, nr_isolated); + +fatal_pending: + cc->total_migrate_scanned += nr_scanned; + if (nr_isolated) + count_compact_events(COMPACTISOLATED, nr_isolated); + + cc->migrate_pfn = low_pfn; + + return ret; +} + +/** + * isolate_migratepages_range() - isolate migrate-able pages in a PFN range + * @cc: Compaction control structure. + * @start_pfn: The first PFN to start isolating. + * @end_pfn: The one-past-last PFN. + * + * Returns -EAGAIN when contented, -EINTR in case of a signal pending, -ENOMEM + * in case we could not allocate a page, or 0. + */ +int +isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn, block_start_pfn, block_end_pfn; + int ret = 0; + + /* Scan block by block. 
First and last block may be incomplete */ + pfn = start_pfn; + block_start_pfn = pageblock_start_pfn(pfn); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; + block_end_pfn = pageblock_end_pfn(pfn); + + for (; pfn < end_pfn; pfn = block_end_pfn, + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, end_pfn); + + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) + continue; + + ret = isolate_migratepages_block(cc, pfn, block_end_pfn, + ISOLATE_UNEVICTABLE); + + if (ret) + break; + + if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX) + break; + } + + return ret; +} + +#endif /* CONFIG_COMPACTION || CONFIG_CMA */ +#ifdef CONFIG_COMPACTION + +static bool suitable_migration_source(struct compact_control *cc, + struct page *page) +{ + int block_mt; + + if (pageblock_skip_persistent(page)) + return false; + + if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) + return true; + + block_mt = get_pageblock_migratetype(page); + + if (cc->migratetype == MIGRATE_MOVABLE) + return is_migrate_movable(block_mt); + else + return block_mt == cc->migratetype; +} + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct compact_control *cc, + struct page *page) +{ + /* If the page is a large free page, then disallow migration */ + if (PageBuddy(page)) { + /* + * We are checking page_order without zone->lock taken. But + * the only small danger is that we skip a potentially suitable + * pageblock, so it's not worth to check order for valid range. 
+ */ + if (buddy_order_unsafe(page) >= pageblock_order) + return false; + } + + if (cc->ignore_block_suitable) + return true; + + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ + if (is_migrate_movable(get_pageblock_migratetype(page))) + return true; + + /* Otherwise skip the block */ + return false; +} + +static inline unsigned int +freelist_scan_limit(struct compact_control *cc) +{ + unsigned short shift = BITS_PER_LONG - 1; + + return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1; +} + +/* + * Test whether the free scanner has reached the same or lower pageblock than + * the migration scanner, and compaction should thus terminate. + */ +static inline bool compact_scanners_met(struct compact_control *cc) +{ + return (cc->free_pfn >> pageblock_order) + <= (cc->migrate_pfn >> pageblock_order); +} + +/* + * Used when scanning for a suitable migration target which scans freelists + * in reverse. Reorders the list such as the unscanned pages are scanned + * first on the next iteration of the free scanner + */ +static void +move_freelist_head(struct list_head *freelist, struct page *freepage) +{ + LIST_HEAD(sublist); + + if (!list_is_last(freelist, &freepage->lru)) { + list_cut_before(&sublist, freelist, &freepage->lru); + list_splice_tail(&sublist, freelist); + } +} + +/* + * Similar to move_freelist_head except used by the migration scanner + * when scanning forward. It's possible for these list operations to + * move against each other if they search the free list exactly in + * lockstep. 
+ */ +static void +move_freelist_tail(struct list_head *freelist, struct page *freepage) +{ + LIST_HEAD(sublist); + + if (!list_is_first(freelist, &freepage->lru)) { + list_cut_position(&sublist, freelist, &freepage->lru); + list_splice_tail(&sublist, freelist); + } +} + +static void +fast_isolate_around(struct compact_control *cc, unsigned long pfn) +{ + unsigned long start_pfn, end_pfn; + struct page *page; + + /* Do not search around if there are enough pages already */ + if (cc->nr_freepages >= cc->nr_migratepages) + return; + + /* Minimise scanning during async compaction */ + if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC) + return; + + /* Pageblock boundaries */ + start_pfn = max(pageblock_start_pfn(pfn), cc->zone->zone_start_pfn); + end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)); + + page = pageblock_pfn_to_page(start_pfn, end_pfn, cc->zone); + if (!page) + return; + + isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); + + /* Skip this pageblock in the future as it's full or nearly full */ + if (cc->nr_freepages < cc->nr_migratepages) + set_pageblock_skip(page); + + return; +} + +/* Search orders in round-robin fashion */ +static int next_search_order(struct compact_control *cc, int order) +{ + order--; + if (order < 0) + order = cc->order - 1; + + /* Search wrapped around? 
*/
+	if (order == cc->search_order) {
+		cc->search_order--;
+		if (cc->search_order < 0)
+			cc->search_order = cc->order - 1;
+		return -1;
+	}
+
+	return order;
+}
+/*
+ * Try to pick a migration target straight from the MIGRATE_MOVABLE free
+ * lists of cc->zone instead of scanning pageblocks linearly. A free page
+ * at or above low_pfn is taken immediately; the highest free page at or
+ * above min_pfn is remembered as a fallback. A page picked this way is
+ * isolated while zone->lock is held and, on success, queued on
+ * cc->freepages.
+ *
+ * Returns cc->free_pfn when no page was selected (this includes
+ * cc->order <= 0), otherwise the pfn of the selected page after
+ * fast_isolate_around() has been called on it.
+ */
+static unsigned long
+fast_isolate_freepages(struct compact_control *cc)
+{
+	unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1);
+	unsigned int nr_scanned = 0;
+	unsigned long low_pfn, min_pfn, highest = 0;
+	unsigned long nr_isolated = 0;
+	unsigned long distance;
+	struct page *page = NULL;
+	bool scan_start = false;
+	int order;
+
+	/* Full compaction passes in a negative order */
+	if (cc->order <= 0)
+		return cc->free_pfn;
+
+	/*
+	 * If starting the scan, use a deeper search and use the highest
+	 * PFN found if a suitable one is not found.
+	 */
+	if (cc->free_pfn >= cc->zone->compact_init_free_pfn) {
+		limit = pageblock_nr_pages >> 1;
+		scan_start = true;
+	}
+
+	/*
+	 * Preferred point is in the top quarter of the scan space but take
+	 * a pfn from the top half if the search is problematic.
+	 */
+	distance = (cc->free_pfn - cc->migrate_pfn);
+	low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2));
+	min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1));
+
+	if (WARN_ON_ONCE(min_pfn > low_pfn))
+		low_pfn = min_pfn;
+
+	/*
+	 * Search starts from the last successful isolation order or the next
+	 * order to search after a previous failure
+	 */
+	cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order);
+
+	for (order = cc->search_order;
+	     !page && order >= 0;
+	     order = next_search_order(cc, order)) {
+		struct free_area *area = &cc->zone->free_area[order];
+		struct list_head *freelist;
+		struct page *freepage;
+		unsigned long flags;
+		unsigned int order_scanned = 0;
+		unsigned long high_pfn = 0;
+
+		if (!area->nr_free)
+			continue;
+
+		spin_lock_irqsave(&cc->zone->lock, flags);
+		freelist = &area->free_list[MIGRATE_MOVABLE];
+		list_for_each_entry_reverse(freepage, freelist, lru) {
+			unsigned long pfn;
+
+			order_scanned++;
+			nr_scanned++;
+			pfn = page_to_pfn(freepage);
+
+			if (pfn >= highest)
+				highest = max(pageblock_start_pfn(pfn),
+					      cc->zone->zone_start_pfn);
+
+			if (pfn >= low_pfn) {
+				cc->fast_search_fail = 0;
+				cc->search_order = order;
+				page = freepage;
+				break;
+			}
+
+			if (pfn >= min_pfn && pfn > high_pfn) {
+				high_pfn = pfn;
+
+				/* Shorten the scan if a candidate is found */
+				limit >>= 1;
+			}
+
+			if (order_scanned >= limit)
+				break;
+		}
+
+		/* Use a minimum pfn if a preferred one was not found */
+		if (!page && high_pfn) {
+			page = pfn_to_page(high_pfn);
+
+			/* Update freepage for the list reorder below */
+			freepage = page;
+		}
+
+		/* Reorder so that a future search skips recent pages */
+		move_freelist_head(freelist, freepage);
+
+		/* Isolate the page if available */
+		if (page) {
+			if (__isolate_free_page(page, order)) {
+				set_page_private(page, order);
+				nr_isolated = 1 << order;
+				nr_scanned += nr_isolated - 1;
+				cc->nr_freepages += nr_isolated;
+				list_add_tail(&page->lru, &cc->freepages);
+				count_compact_events(COMPACTISOLATED, nr_isolated);
+			} else {
+				/*
+				 * If isolation fails, abort the search: reset
+				 * order so next_search_order() sees the
+				 * starting order and page so no stale pointer
+				 * is used below.
+				 */
+				order = cc->search_order + 1;
+				page = NULL;
+			}
+		}
+
+		spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+		/*
+		 * Smaller scan on next order so the total scan is related
+		 * to freelist_scan_limit.
+		 */
+		if (order_scanned >= limit)
+			limit = max(1U, limit >> 1);
+	}
+
+	if (!page) {
+		cc->fast_search_fail++;
+		if (scan_start) {
+			/*
+			 * Use the highest PFN found above min. If one was
+			 * not found, be pessimistic for direct compaction
+			 * and use the min mark.
+			 */
+			if (highest >= min_pfn) {
+				page = pfn_to_page(highest);
+				cc->free_pfn = highest;
+			} else {
+				if (cc->direct_compaction && pfn_valid(min_pfn)) {
+					page = pageblock_pfn_to_page(min_pfn,
+						min(pageblock_end_pfn(min_pfn),
+						    zone_end_pfn(cc->zone)),
+						cc->zone);
+					cc->free_pfn = min_pfn;
+				}
+			}
+		}
+	}
+
+	if (highest && highest >= cc->zone->compact_cached_free_pfn) {
+		highest -= pageblock_nr_pages;
+		cc->zone->compact_cached_free_pfn = highest;
+	}
+
+	cc->total_free_scanned += nr_scanned;
+	if (!page)
+		return cc->free_pfn;
+
+	low_pfn = page_to_pfn(page);
+	fast_isolate_around(cc, low_pfn);
+	return low_pfn;
+}
+
+/*
+ * Based on information in the current compact_control, find blocks
+ * suitable for isolating free pages from and then isolate them.
+ *
+ * fast_isolate_freepages() is tried first; if it leaves cc->nr_freepages
+ * non-zero the linear scan is skipped entirely. Otherwise pageblocks are
+ * walked downwards from cc->free_pfn until the end of the pageblock that
+ * holds cc->migrate_pfn, and the restart position is stored back in
+ * cc->free_pfn. In both cases split_map_pages() is run on cc->freepages
+ * before returning.
+ */
+static void isolate_freepages(struct compact_control *cc)
+{
+	struct zone *zone = cc->zone;
+	struct page *page;
+	unsigned long block_start_pfn;	/* start of current pageblock */
+	unsigned long isolate_start_pfn; /* exact pfn we start at */
+	unsigned long block_end_pfn;	/* end of current pageblock */
+	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
+	struct list_head *freelist = &cc->freepages;
+	unsigned int stride;
+
+	/* Try a small search of the free lists for a candidate */
+	fast_isolate_freepages(cc);
+	if (cc->nr_freepages)
+		goto splitmap;
+
+	/*
+	 * Initialise the free scanner. The starting point is where we last
+	 * successfully isolated from, zone-cached value, or the end of the
+	 * zone when isolating for the first time. For looping we also need
+	 * this pfn aligned down to the pageblock boundary, because we do
+	 * block_start_pfn -= pageblock_nr_pages in the for loop.
+	 * For ending point, take care when isolating in last pageblock of a
+	 * zone which ends in the middle of a pageblock.
+	 * The low boundary is the end of the pageblock the migration scanner
+	 * is using.
+	 */
+	isolate_start_pfn = cc->free_pfn;
+	block_start_pfn = pageblock_start_pfn(isolate_start_pfn);
+	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
+						zone_end_pfn(zone));
+	low_pfn = pageblock_end_pfn(cc->migrate_pfn);
+	stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1;
+
+	/*
+	 * Isolate free pages until enough are available to migrate the
+	 * pages on cc->migratepages. We stop searching if the migrate
+	 * and free page scanners meet or enough free pages are isolated.
+	 */
+	for (; block_start_pfn >= low_pfn;
+				block_end_pfn = block_start_pfn,
+				block_start_pfn -= pageblock_nr_pages,
+				isolate_start_pfn = block_start_pfn) {
+		unsigned long nr_isolated;
+
+		/*
+		 * This can iterate a massively long zone without finding any
+		 * suitable migration targets, so periodically check resched.
+		 */
+		if (!(block_start_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
+			cond_resched();
+
+		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+									zone);
+		if (!page)
+			continue;
+
+		/* Check the block is suitable for migration */
+		if (!suitable_migration_target(cc, page))
+			continue;
+
+		/* If isolation recently failed, do not retry */
+		if (!isolation_suitable(cc, page))
+			continue;
+
+		/* Found a block suitable for isolating free pages from. */
+		nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+					block_end_pfn, freelist, stride, false);
+
+		/* Update the skip hint if the full pageblock was scanned */
+		if (isolate_start_pfn == block_end_pfn)
+			update_pageblock_skip(cc, page, block_start_pfn);
+
+		/* Are enough freepages isolated? */
+		if (cc->nr_freepages >= cc->nr_migratepages) {
+			if (isolate_start_pfn >= block_end_pfn) {
+				/*
+				 * Restart at previous pageblock if more
+				 * freepages can be isolated next time.
+				 */
+				isolate_start_pfn =
+					block_start_pfn - pageblock_nr_pages;
+			}
+			break;
+		} else if (isolate_start_pfn < block_end_pfn) {
+			/*
+			 * If isolation failed early, do not continue
+			 * needlessly.
+			 */
+			break;
+		}
+
+		/*
+		 * Adjust stride depending on isolation: back to 1 after a
+		 * block that yielded pages, otherwise doubled up to
+		 * COMPACT_CLUSTER_MAX.
+		 */
+		if (nr_isolated) {
+			stride = 1;
+			continue;
+		}
+		stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1);
+	}
+
+	/*
+	 * Record where the free scanner will restart next time. Either we
+	 * broke from the loop and set isolate_start_pfn based on the last
+	 * call to isolate_freepages_block(), or we met the migration scanner
+	 * and the loop terminated due to isolate_start_pfn < low_pfn
+	 */
+	cc->free_pfn = isolate_start_pfn;
+
+splitmap:
+	/* __isolate_free_page() does not map the pages */
+	split_map_pages(freelist);
+}
+
+/*
+ * This is a migrate-callback that "allocates" freepages by taking pages
+ * from the isolated freelists in the block we are migrating to.
+ *
+ * @data carries the struct compact_control pointer. When cc->freepages is
+ * empty it is refilled through isolate_freepages(); NULL is returned if it
+ * is still empty afterwards. @migratepage is not looked at here.
+ */
+static struct page *compaction_alloc(struct page *migratepage,
+					unsigned long data)
+{
+	struct compact_control *cc = (struct compact_control *)data;
+	struct page *freepage;
+
+	if (list_empty(&cc->freepages)) {
+		isolate_freepages(cc);
+
+		if (list_empty(&cc->freepages))
+			return NULL;
+	}
+
+	freepage = list_entry(cc->freepages.next, struct page, lru);
+	list_del(&freepage->lru);
+	cc->nr_freepages--;
+
+	return freepage;
+}
+
+/*
+ * This is a migrate-callback that "frees" freepages back to the isolated
+ * freelist. All pages on the freelist are from the same zone, so there is no
+ * special handling needed for NUMA.
+ *
+ * @data carries the struct compact_control pointer; the page is put at the
+ * head of cc->freepages and cc->nr_freepages is incremented.
+ */
+static void compaction_free(struct page *page, unsigned long data)
+{
+	struct compact_control *cc = (struct compact_control *)data;
+
+	list_add(&page->lru, &cc->freepages);
+	cc->nr_freepages++;
+}
+
+/* possible outcome of isolate_migratepages */
+typedef enum {
+	ISOLATE_ABORT,		/* Abort compaction now */
+	ISOLATE_NONE,		/* No pages isolated, continue scanning */
+	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
+} isolate_migrate_t;
+
+/*
+ * Allow userspace to control policy on scanning the unevictable LRU for
+ * compactable pages.
+ */
+int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT;
+/*
+ * Remember the lowest pfn handed to this helper in cc->fast_start_pfn.
+ * A value of 0 means nothing has been recorded yet; ULONG_MAX is the
+ * value reinit_migrate_pfn() leaves behind once the record has been
+ * consumed, after which no further updates are made.
+ */
+static inline void
+update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
+{
+	if (cc->fast_start_pfn == ULONG_MAX)
+		return;
+
+	if (!cc->fast_start_pfn)
+		cc->fast_start_pfn = pfn;
+
+	cc->fast_start_pfn = min(cc->fast_start_pfn, pfn);
+}
+/*
+ * Move cc->migrate_pfn to the pfn recorded by update_fast_start_pfn(), if
+ * there is one, and mark the record as consumed (ULONG_MAX). Returns the
+ * resulting cc->migrate_pfn, which is unchanged when nothing was recorded
+ * or the record was already consumed.
+ */
+static inline unsigned long
+reinit_migrate_pfn(struct compact_control *cc)
+{
+	if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX)
+		return cc->migrate_pfn;
+
+	cc->migrate_pfn = cc->fast_start_pfn;
+	cc->fast_start_pfn = ULONG_MAX;
+
+	return cc->migrate_pfn;
+}
+
+/*
+ * Briefly search the free lists for a migration source that already has
+ * some free pages to reduce the number of pages that need migration
+ * before a pageblock is free.
+ *
+ * Returns the pfn the linear migration scan should start from: the
+ * unchanged cc->migrate_pfn when the fast search is not attempted, the
+ * start of the chosen pageblock (not below the zone start) when a block
+ * is found, or the result of reinit_migrate_pfn() when the search fails.
+ * A chosen pageblock is marked with set_pageblock_skip().
+ */
+static unsigned long fast_find_migrateblock(struct compact_control *cc)
+{
+	unsigned int limit = freelist_scan_limit(cc);
+	unsigned int nr_scanned = 0;
+	unsigned long distance;
+	unsigned long pfn = cc->migrate_pfn;
+	unsigned long high_pfn;
+	int order;
+	bool found_block = false;
+
+	/* Skip hints are relied on to avoid repeats on the fast search */
+	if (cc->ignore_skip_hint)
+		return pfn;
+
+	/*
+	 * If the migrate_pfn is not at the start of a zone or the start
+	 * of a pageblock then assume this is a continuation of a previous
+	 * scan restarted due to COMPACT_CLUSTER_MAX.
+	 */
+	if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn))
+		return pfn;
+
+	/*
+	 * For smaller orders, just linearly scan as the number of pages
+	 * to migrate should be relatively small and does not necessarily
+	 * justify freeing up a large block for a small allocation.
+	 */
+	if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
+		return pfn;
+
+	/*
+	 * Only allow kcompactd and direct requests for movable pages to
+	 * quickly clear out a MOVABLE pageblock for allocation. This
+	 * reduces the risk that a large movable pageblock is freed for
+	 * an unmovable/reclaimable small allocation.
+	 */
+	if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
+		return pfn;
+
+	/*
+	 * When starting the migration scanner, pick any pageblock within the
+	 * first half of the search space. Otherwise try and pick a pageblock
+	 * within the first eighth to reduce the chances that a migration
+	 * target later becomes a source.
+	 */
+	distance = (cc->free_pfn - cc->migrate_pfn) >> 1;
+	if (cc->migrate_pfn != cc->zone->zone_start_pfn)
+		distance >>= 2;
+	high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
+
+	for (order = cc->order - 1;
+	     order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit;
+	     order--) {
+		struct free_area *area = &cc->zone->free_area[order];
+		struct list_head *freelist;
+		unsigned long flags;
+		struct page *freepage;
+
+		if (!area->nr_free)
+			continue;
+
+		spin_lock_irqsave(&cc->zone->lock, flags);
+		freelist = &area->free_list[MIGRATE_MOVABLE];
+		list_for_each_entry(freepage, freelist, lru) {
+			unsigned long free_pfn;
+
+			if (nr_scanned++ >= limit) {
+				move_freelist_tail(freelist, freepage);
+				break;
+			}
+
+			free_pfn = page_to_pfn(freepage);
+			if (free_pfn < high_pfn) {
+				/*
+				 * Avoid if skipped recently. Ideally it would
+				 * move to the tail but even safe iteration of
+				 * the list assumes an entry is deleted, not
+				 * reordered.
+				 */
+				if (get_pageblock_skip(freepage))
+					continue;
+
+				/* Reorder so that a future search skips recent pages */
+				move_freelist_tail(freelist, freepage);
+
+				update_fast_start_pfn(cc, free_pfn);
+				pfn = pageblock_start_pfn(free_pfn);
+				if (pfn < cc->zone->zone_start_pfn)
+					pfn = cc->zone->zone_start_pfn;
+				cc->fast_search_fail = 0;
+				found_block = true;
+				set_pageblock_skip(freepage);
+				break;
+			}
+		}
+		spin_unlock_irqrestore(&cc->zone->lock, flags);
+	}
+
+	cc->total_migrate_scanned += nr_scanned;
+
+	/*
+	 * If fast scanning failed then use a cached entry for a page block
+	 * that had free pages as the basis for starting a linear scan.
+	 */
+	if (!found_block) {
+		cc->fast_search_fail++;
+		pfn = reinit_migrate_pfn(cc);
+	}
+	return pfn;
+}
+
+/*
+ * Isolate all pages that can be migrated from the first suitable block,
+ * starting at the block pointed to by the migrate scanner pfn within
+ * compact_control.
+ *
+ * Returns ISOLATE_ABORT when isolate_migratepages_block() reports a
+ * non-zero result; otherwise ISOLATE_SUCCESS if cc->nr_migratepages is
+ * non-zero and ISOLATE_NONE if it is zero.
+ */
+static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
+{
+	unsigned long block_start_pfn;
+	unsigned long block_end_pfn;
+	unsigned long low_pfn;
+	struct page *page;
+	const isolate_mode_t isolate_mode =
+		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
+		(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
+	bool fast_find_block;
+
+	/*
+	 * Start at where we last stopped, or beginning of the zone as
+	 * initialized by compact_zone(). The first failure will use
+	 * the lowest PFN as the starting point for linear scanning.
+	 */
+	low_pfn = fast_find_migrateblock(cc);
+	block_start_pfn = pageblock_start_pfn(low_pfn);
+	if (block_start_pfn < cc->zone->zone_start_pfn)
+		block_start_pfn = cc->zone->zone_start_pfn;
+
+	/*
+	 * fast_find_migrateblock marks a pageblock skipped so to avoid
+	 * the isolation_suitable check below, check whether the fast
+	 * search was successful.
+	 */
+	fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;
+
+	/* Only scan within a pageblock boundary */
+	block_end_pfn = pageblock_end_pfn(low_pfn);
+
+	/*
+	 * Iterate over whole pageblocks until we find the first suitable.
+	 * Do not cross the free scanner.
+	 */
+	for (; block_end_pfn <= cc->free_pfn;
+			fast_find_block = false,
+			cc->migrate_pfn = low_pfn = block_end_pfn,
+			block_start_pfn = block_end_pfn,
+			block_end_pfn += pageblock_nr_pages) {
+
+		/*
+		 * This can potentially iterate a massively long zone with
+		 * many pageblocks unsuitable, so periodically check if we
+		 * need to schedule.
+		 */
+		if (!(low_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
+			cond_resched();
+
+		page = pageblock_pfn_to_page(block_start_pfn,
+						block_end_pfn, cc->zone);
+		if (!page)
+			continue;
+
+		/*
+		 * If isolation recently failed, do not retry. Only check the
+		 * pageblock once. COMPACT_CLUSTER_MAX causes a pageblock
+		 * to be visited multiple times. Assume skip was checked
+		 * before making it "skip" so other compaction instances do
+		 * not scan the same block.
+		 */
+		if (pageblock_aligned(low_pfn) &&
+		    !fast_find_block && !isolation_suitable(cc, page))
+			continue;
+
+		/*
+		 * For async direct compaction, only scan the pageblocks of the
+		 * same migratetype without huge pages. Async direct compaction
+		 * is optimistic to see if the minimum amount of work satisfies
+		 * the allocation. The cached PFN is updated as it's possible
+		 * that all remaining blocks between source and target are
+		 * unsuitable and the compaction scanners fail to meet.
+		 */
+		if (!suitable_migration_source(cc, page)) {
+			update_cached_migrate(cc, block_end_pfn);
+			continue;
+		}
+
+		/* Perform the isolation */
+		if (isolate_migratepages_block(cc, low_pfn, block_end_pfn,
+						isolate_mode))
+			return ISOLATE_ABORT;
+
+		/*
+		 * Either we isolated something and proceed with migration. Or
+		 * we failed and compact_zone should decide if we should
+		 * continue or not.
+		 */
+		break;
+	}
+
+	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
+}
+
+/*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
+static inline bool is_via_compact_memory(int order)
+{
+	return order == -1;
+}
+
+/*
+ * Determine whether kswapd is (or recently was!) running on this node.
+ *
+ * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't
+ * zero it.
+ */
+static bool kswapd_is_running(pg_data_t *pgdat)
+{
+	bool running;
+
+	pgdat_kswapd_lock(pgdat);
+	running = pgdat->kswapd && task_is_running(pgdat->kswapd);
+	pgdat_kswapd_unlock(pgdat);
+
+	return running;
+}
+
+/*
+ * A zone's fragmentation score is the external fragmentation with respect
+ * to COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
+ */
+static unsigned int fragmentation_score_zone(struct zone *zone)
+{
+	return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+}
+
+/*
+ * A weighted zone's fragmentation score is the external fragmentation
+ * with respect to COMPACTION_HPAGE_ORDER scaled by the zone's size. It
+ * returns a value in the range [0, 100].
+ *
+ * The scaling factor ensures that proactive compaction focuses on larger
+ * zones like ZONE_NORMAL, rather than smaller, specialized zones like
+ * ZONE_DMA32. For smaller zones, the score value remains close to zero,
+ * and thus never exceeds the high threshold for proactive compaction.
+ *
+ * The score is multiplied by zone->present_pages and divided by the
+ * node's node_present_pages + 1 (the + 1 presumably guards against a
+ * zero divisor - confirm).
+ */
+static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
+{
+	unsigned long score;
+
+	score = zone->present_pages * fragmentation_score_zone(zone);
+	return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
+}
+
+/*
+ * The per-node proactive (background) compaction process is started by its
+ * corresponding kcompactd thread when the node's fragmentation score
+ * exceeds the high threshold. The compaction process remains active till
+ * the node's score falls below the low threshold, or one of the back-off
+ * conditions is met.
+ */
+static unsigned int fragmentation_score_node(pg_data_t *pgdat)
+{
+	unsigned int score = 0;
+	int zoneid;
+
+	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+		struct zone *zone;
+
+		zone = &pgdat->node_zones[zoneid];
+		score += fragmentation_score_zone_weighted(zone);	/* node score = sum of weighted zone scores */
+	}
+
+	return score;
+}
+/*
+ * Return the proactive compaction watermark for a node: the low one when
+ * @low is true, the high one otherwise. The low watermark is
+ * 100 - sysctl_compaction_proactiveness but never below 5; the high
+ * watermark is the low one plus 10, capped at 100. @pgdat is not used by
+ * the body.
+ */
+static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
+{
+	unsigned int wmark_low;
+
+	/*
+	 * Cap the low watermark to avoid excessive compaction
+	 * activity in case a user sets the proactiveness tunable
+	 * close to 100 (maximum).
+	 */
+	wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
+	return low ? wmark_low : min(wmark_low + 10, 100U);
+}
+/*
+ * Returns true when proactive compaction should be started on @pgdat:
+ * the proactiveness tunable is non-zero, kswapd is not running on the
+ * node, and the node fragmentation score exceeds the high watermark.
+ */
+static bool should_proactive_compact_node(pg_data_t *pgdat)
+{
+	int wmark_high;
+
+	if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
+		return false;
+
+	wmark_high = fragmentation_score_wmark(pgdat, false);
+	return fragmentation_score_node(pgdat) > wmark_high;
+}
+/*
+ * Decide whether the current compaction run should stop. Returns
+ * COMPACT_COMPLETE or COMPACT_PARTIAL_SKIPPED once the scanners have met
+ * (COMPACT_PARTIAL_SKIPPED is also used when kswapd is running during
+ * proactive compaction), COMPACT_SUCCESS when a usable free page of at
+ * least cc->order exists or the proactive zone score is at or below the
+ * low watermark, COMPACT_CONTINUE to keep going, COMPACT_NO_SUITABLE_PAGE
+ * when the free list check found nothing, and COMPACT_CONTENDED when
+ * cc->contended is set or a fatal signal is pending at the out label.
+ */
+static enum compact_result __compact_finished(struct compact_control *cc)
+{
+	unsigned int order;
+	const int migratetype = cc->migratetype;
+	int ret;
+
+	/* Compaction run completes if the migrate and free scanner meet */
+	if (compact_scanners_met(cc)) {
+		/* Let the next compaction start anew. */
+		reset_cached_positions(cc->zone);
+
+		/*
+		 * Mark that the PG_migrate_skip information should be cleared
+		 * by kswapd when it goes to sleep. kcompactd does not set the
+		 * flag itself as the decision to clear should be directly
+		 * based on an allocation request.
+		 */
+		if (cc->direct_compaction)
+			cc->zone->compact_blockskip_flush = true;
+
+		if (cc->whole_zone)
+			return COMPACT_COMPLETE;
+		else
+			return COMPACT_PARTIAL_SKIPPED;
+	}
+
+	if (cc->proactive_compaction) {
+		int score, wmark_low;
+		pg_data_t *pgdat;
+
+		pgdat = cc->zone->zone_pgdat;
+		if (kswapd_is_running(pgdat))
+			return COMPACT_PARTIAL_SKIPPED;
+
+		score = fragmentation_score_zone(cc->zone);
+		wmark_low = fragmentation_score_wmark(pgdat, true);
+
+		if (score > wmark_low)
+			ret = COMPACT_CONTINUE;
+		else
+			ret = COMPACT_SUCCESS;
+
+		goto out;
+	}
+
+	if (is_via_compact_memory(cc->order))
+		return COMPACT_CONTINUE;
+
+	/*
+	 * Always finish scanning a pageblock to reduce the possibility of
+	 * fallbacks in the future. This is particularly important when
+	 * migration source is unmovable/reclaimable but it's not worth
+	 * special casing.
+	 */
+	if (!pageblock_aligned(cc->migrate_pfn))
+		return COMPACT_CONTINUE;
+
+	/* Direct compactor: Is a suitable page free? */
+	ret = COMPACT_NO_SUITABLE_PAGE;
+	for (order = cc->order; order < MAX_ORDER; order++) {
+		struct free_area *area = &cc->zone->free_area[order];
+		bool can_steal;
+
+		/* Job done if page is free of the right migratetype */
+		if (!free_area_empty(area, migratetype))
+			return COMPACT_SUCCESS;
+
+#ifdef CONFIG_CMA
+		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
+		if (migratetype == MIGRATE_MOVABLE &&
+			!free_area_empty(area, MIGRATE_CMA))
+			return COMPACT_SUCCESS;
+#endif
+		/*
+		 * Job done if allocation would steal freepages from
+		 * other migratetype buddy lists.
+		 */
+		if (find_suitable_fallback(area, order, migratetype,
+						true, &can_steal) != -1)
+			/*
+			 * Movable pages are OK in any pageblock. If we are
+			 * stealing for a non-movable allocation, make sure
+			 * we finish compacting the current pageblock first
+			 * (which is assured by the above migrate_pfn align
+			 * check) so it is as free as possible and we won't
+			 * have to steal another one soon.
+			 */
+			return COMPACT_SUCCESS;
+	}
+
+out:
+	if (cc->contended || fatal_signal_pending(current))
+		ret = COMPACT_CONTENDED;
+
+	return ret;
+}
+/*
+ * Traced wrapper for __compact_finished(): the internal
+ * COMPACT_NO_SUITABLE_PAGE result is passed to the tracepoint as is but
+ * handed back to the caller as COMPACT_CONTINUE.
+ */
+static enum compact_result compact_finished(struct compact_control *cc)
+{
+	int ret;
+
+	ret = __compact_finished(cc);
+	trace_mm_compaction_finished(cc->zone, cc->order, ret);
+	if (ret == COMPACT_NO_SUITABLE_PAGE)
+		ret = COMPACT_CONTINUE;
+
+	return ret;
+}
+/*
+ * Watermark check shared by compaction_suitable() and
+ * compaction_zonelist_suitable(). @wmark_target is passed through to
+ * __zone_watermark_ok() for the order-0 check; the callers in this file
+ * supply a free page count. Returns COMPACT_CONTINUE for order == -1,
+ * COMPACT_SUCCESS when the @order watermark is already met,
+ * COMPACT_SKIPPED when the order-0 watermark plus compact_gap(@order) is
+ * not met, and COMPACT_CONTINUE otherwise.
+ */
+static enum compact_result __compaction_suitable(struct zone *zone, int order,
+					unsigned int alloc_flags,
+					int highest_zoneidx,
+					unsigned long wmark_target)
+{
+	unsigned long watermark;
+
+	if (is_via_compact_memory(order))
+		return COMPACT_CONTINUE;
+
+	watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
+	/*
+	 * If watermarks for high-order allocation are already met, there
+	 * should be no need for compaction at all.
+	 */
+	if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
+								alloc_flags))
+		return COMPACT_SUCCESS;
+
+	/*
+	 * Watermarks for order-0 must be met for compaction to be able to
+	 * isolate free pages for migration targets. This means that the
+	 * watermark and alloc_flags have to match, or be more pessimistic than
+	 * the check in __isolate_free_page(). We don't use the direct
+	 * compactor's alloc_flags, as they are not relevant for freepage
+	 * isolation. We however do use the direct compactor's highest_zoneidx
+	 * to skip over zones where lowmem reserves would prevent allocation
+	 * even if compaction succeeds.
+	 * For costly orders, we require low watermark instead of min for
+	 * compaction to proceed to increase its chances.
+	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
+	 * suitable migration targets
+	 */
+	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
+				low_wmark_pages(zone) : min_wmark_pages(zone);
+	watermark += compact_gap(order);
+	if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
+						ALLOC_CMA, wmark_target))
+		return COMPACT_SKIPPED;
+
+	return COMPACT_CONTINUE;
+}
+
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ *   COMPACT_SKIPPED  - If there are too few free pages for compaction, or
+ *                      the fragmentation index says a costly-order failure
+ *                      would not be due to fragmentation
+ *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
+ *   COMPACT_CONTINUE - If compaction should run now
+ */
+enum compact_result compaction_suitable(struct zone *zone, int order,
+					unsigned int alloc_flags,
+					int highest_zoneidx)
+{
+	enum compact_result ret;
+	int fragindex;
+
+	ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx,
+				    zone_page_state(zone, NR_FREE_PAGES));
+	/*
+	 * fragmentation index determines if allocation failures are due to
+	 * low memory or external fragmentation
+	 *
+	 * index of -1000 would imply allocations might succeed depending on
+	 * watermarks, but we already failed the high-order watermark check
+	 * index towards 0 implies failure is due to lack of memory
+	 * index towards 1000 implies failure is due to fragmentation
+	 *
+	 * Only compact if a failure would be due to fragmentation. Also
+	 * ignore fragindex for non-costly orders where the alternative to
+	 * a successful reclaim/compaction is OOM. Fragindex and the
+	 * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
+	 * excessive compaction for costly orders, but it should not be at the
+	 * expense of system stability.
+	 */
+	if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
+		fragindex = fragmentation_index(zone, order);
+		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+			ret = COMPACT_NOT_SUITABLE_ZONE;
+	}
+
+	/* The tracepoint sees COMPACT_NOT_SUITABLE_ZONE, callers do not */
+	trace_mm_compaction_suitable(zone, order, ret);
+	if (ret == COMPACT_NOT_SUITABLE_ZONE)
+		ret = COMPACT_SKIPPED;
+
+	return ret;
+}
+/*
+ * Returns true if __compaction_suitable() yields COMPACT_CONTINUE for at
+ * least one zone of @ac once the free page count is topped up with
+ * zone_reclaimable_pages() / @order, false otherwise.
+ * NOTE(review): @order is used as a divisor below; callers presumably
+ * never pass 0 here - confirm.
+ */
+bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
+		int alloc_flags)
+{
+	struct zone *zone;
+	struct zoneref *z;
+
+	/*
+	 * Make sure at least one zone would pass __compaction_suitable if we continue
+	 * retrying the reclaim.
+	 */
+	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+				ac->highest_zoneidx, ac->nodemask) {
+		unsigned long available;
+		enum compact_result compact_result;
+
+		/*
+		 * Do not consider all the reclaimable memory because we do not
+		 * want to thrash just for a single high order allocation which
+		 * is not even guaranteed to appear even if __compaction_suitable
+		 * is happy about the watermark check.
+		 */
+		available = zone_reclaimable_pages(zone) / order;
+		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+		compact_result = __compaction_suitable(zone, order, alloc_flags,
+				ac->highest_zoneidx, available);
+		if (compact_result == COMPACT_CONTINUE)
+			return true;
+	}
+
+	return false;
+}
+/*
+ * Run one compaction pass over cc->zone. Returns early with the result of
+ * compaction_suitable() unless that is COMPACT_CONTINUE. Otherwise the
+ * migrate and free scanners are positioned (zone edges for whole_zone,
+ * cached pfns otherwise) and isolate_migratepages() alternates with
+ * migrate_pages() until compact_finished() stops returning
+ * COMPACT_CONTINUE, isolation aborts, migration fails with -ENOMEM before
+ * the scanners have met, or @capc holds a captured page. Isolated free
+ * pages that were not used are released before returning. @capc may be
+ * NULL.
+ */
+static enum compact_result
+compact_zone(struct compact_control *cc, struct capture_control *capc)
+{
+	enum compact_result ret;
+	unsigned long start_pfn = cc->zone->zone_start_pfn;
+	unsigned long end_pfn = zone_end_pfn(cc->zone);
+	unsigned long last_migrated_pfn;
+	const bool sync = cc->mode != MIGRATE_ASYNC;
+	bool update_cached;
+	unsigned int nr_succeeded = 0;
+
+	/*
+	 * These counters track activities during zone compaction.  Initialize
+	 * them before compacting a new zone.
+	 */
+	cc->total_migrate_scanned = 0;
+	cc->total_free_scanned = 0;
+	cc->nr_migratepages = 0;
+	cc->nr_freepages = 0;
+	INIT_LIST_HEAD(&cc->freepages);
+	INIT_LIST_HEAD(&cc->migratepages);
+
+	cc->migratetype = gfp_migratetype(cc->gfp_mask);
+	ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
+							cc->highest_zoneidx);
+	/*
+	 * Either compaction is not needed (COMPACT_SUCCESS) or it is likely
+	 * to fail (COMPACT_SKIPPED)
+	 */
+	if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
+		return ret;
+
+	/* huh, compaction_suitable is returning something unexpected */
+	VM_BUG_ON(ret != COMPACT_CONTINUE);
+
+	/*
+	 * Clear pageblock skip if there were failures recently and compaction
+	 * is about to be retried after being deferred.
+	 */
+	if (compaction_restarting(cc->zone, cc->order))
+		__reset_isolation_suitable(cc->zone);
+
+	/*
+	 * Setup to move all movable pages to the end of the zone. Use cached
+	 * information on where the scanners should start (unless we explicitly
+	 * want to compact the whole zone), but check that it is initialised
+	 * by ensuring the values are within zone boundaries.
+	 */
+	cc->fast_start_pfn = 0;
+	if (cc->whole_zone) {
+		cc->migrate_pfn = start_pfn;
+		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+	} else {
+		cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync];
+		cc->free_pfn = cc->zone->compact_cached_free_pfn;
+		if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+			cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
+			cc->zone->compact_cached_free_pfn = cc->free_pfn;
+		}
+		if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
+			cc->migrate_pfn = start_pfn;
+			cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+			cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+		}
+
+		if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
+			cc->whole_zone = true;
+	}
+
+	last_migrated_pfn = 0;
+
+	/*
+	 * Migrate has separate cached PFNs for ASYNC and SYNC* migration on
+	 * the basis that some migrations will fail in ASYNC mode. However,
+	 * if the cached PFNs match and pageblocks are skipped due to having
+	 * no isolation candidates, then the sync state does not matter.
+	 * Until a pageblock with isolation candidates is found, keep the
+	 * cached PFNs in sync to avoid revisiting the same blocks.
+	 */
+	update_cached = !sync &&
+		cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
+
+	trace_mm_compaction_begin(cc, start_pfn, end_pfn, sync);
+
+	/* lru_add_drain_all could be expensive as it involves other CPUs */
+	lru_add_drain();
+
+	while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
+		int err;
+		unsigned long iteration_start_pfn = cc->migrate_pfn;
+
+		/*
+		 * Avoid multiple rescans which can happen if a page cannot be
+		 * isolated (dirty/writeback in async mode) or if the migrated
+		 * pages are being allocated before the pageblock is cleared.
+		 * The first rescan will capture the entire pageblock for
+		 * migration. If it fails, it'll be marked skip and scanning
+		 * will proceed as normal.
+		 */
+		cc->rescan = false;
+		if (pageblock_start_pfn(last_migrated_pfn) ==
+		    pageblock_start_pfn(iteration_start_pfn)) {
+			cc->rescan = true;
+		}
+
+		switch (isolate_migratepages(cc)) {
+		case ISOLATE_ABORT:
+			ret = COMPACT_CONTENDED;
+			putback_movable_pages(&cc->migratepages);
+			cc->nr_migratepages = 0;
+			goto out;
+		case ISOLATE_NONE:
+			if (update_cached) {
+				cc->zone->compact_cached_migrate_pfn[1] =
+					cc->zone->compact_cached_migrate_pfn[0];
+			}
+
+			/*
+			 * We haven't isolated and migrated anything, but
+			 * there might still be unflushed migrations from
+			 * previous cc->order aligned block.
+			 */
+			goto check_drain;
+		case ISOLATE_SUCCESS:
+			update_cached = false;
+			last_migrated_pfn = iteration_start_pfn;
+		}
+
+		err = migrate_pages(&cc->migratepages, compaction_alloc,
+				compaction_free, (unsigned long)cc, cc->mode,
+				MR_COMPACTION, &nr_succeeded);
+
+		trace_mm_compaction_migratepages(cc, nr_succeeded);
+
+		/* All pages were either migrated or will be released */
+		cc->nr_migratepages = 0;
+		if (err) {
+			putback_movable_pages(&cc->migratepages);
+			/*
+			 * migrate_pages() may return -ENOMEM when scanners meet
+			 * and we want compact_finished() to detect it
+			 */
+			if (err == -ENOMEM && !compact_scanners_met(cc)) {
+				ret = COMPACT_CONTENDED;
+				goto out;
+			}
+			/*
+			 * We failed to migrate at least one page in the current
+			 * order-aligned block, so skip the rest of it.
+			 */
+			if (cc->direct_compaction &&
+						(cc->mode == MIGRATE_ASYNC)) {
+				cc->migrate_pfn = block_end_pfn(
+						cc->migrate_pfn - 1, cc->order);
+				/* Draining pcplists is useless in this case */
+				last_migrated_pfn = 0;
+			}
+		}
+
+check_drain:
+		/*
+		 * Has the migration scanner moved away from the previous
+		 * cc->order aligned block where we migrated from? If yes,
+		 * flush the pages that were freed, so that they can merge and
+		 * compact_finished() can detect immediately if allocation
+		 * would succeed.
+		 */
+		if (cc->order > 0 && last_migrated_pfn) {
+			unsigned long current_block_start =
+				block_start_pfn(cc->migrate_pfn, cc->order);
+
+			if (last_migrated_pfn < current_block_start) {
+				lru_add_drain_cpu_zone(cc->zone);
+				/* No more flushing until we migrate again */
+				last_migrated_pfn = 0;
+			}
+		}
+
+		/* Stop if a page has been captured */
+		if (capc && capc->page) {
+			ret = COMPACT_SUCCESS;
+			break;
+		}
+	}
+
+out:
+	/*
+	 * Release free pages and update where the free scanner should restart,
+	 * so we don't leave any returned pages behind in the next attempt.
+	 */
+	if (cc->nr_freepages > 0) {
+		unsigned long free_pfn = release_freepages(&cc->freepages);
+
+		cc->nr_freepages = 0;
+		VM_BUG_ON(free_pfn == 0);
+		/* The cached pfn is always the first in a pageblock */
+		free_pfn = pageblock_start_pfn(free_pfn);
+		/*
+		 * Only go back, not forward. The cached pfn might have been
+		 * already reset to zone end in compact_finished()
+		 */
+		if (free_pfn > cc->zone->compact_cached_free_pfn)
+			cc->zone->compact_cached_free_pfn = free_pfn;
+	}
+
+	count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
+	count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);
+
+	trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret);
+
+	return ret;
+}
+/*
+ * Direct compaction of a single zone. Builds a compact_control from the
+ * arguments (MIGRATE_ASYNC for COMPACT_PRIO_ASYNC, MIGRATE_SYNC_LIGHT
+ * otherwise; whole-zone scanning and ignoring of skip/suitability hints
+ * only at MIN_COMPACT_PRIORITY), publishes a capture_control through
+ * current->capture_control for the duration of compact_zone(), and stores
+ * the captured page or NULL in *@capture. A captured page forces the
+ * result to COMPACT_SUCCESS.
+ */
+static enum compact_result compact_zone_order(struct zone *zone, int order,
+		gfp_t gfp_mask, enum compact_priority prio,
+		unsigned int alloc_flags, int highest_zoneidx,
+		struct page **capture)
+{
+	enum compact_result ret;
+	struct compact_control cc = {
+		.order = order,
+		.search_order = order,
+		.gfp_mask = gfp_mask,
+		.zone = zone,
+		.mode = (prio == COMPACT_PRIO_ASYNC) ?
+					MIGRATE_ASYNC :	MIGRATE_SYNC_LIGHT,
+		.alloc_flags = alloc_flags,
+		.highest_zoneidx = highest_zoneidx,
+		.direct_compaction = true,
+		.whole_zone = (prio == MIN_COMPACT_PRIORITY),
+		.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
+		.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
+	};
+	struct capture_control capc = {
+		.cc = &cc,
+		.page = NULL,
+	};
+
+	/*
+	 * Make sure the structs are really initialized before we expose the
+	 * capture control, in case we are interrupted and the interrupt handler
+	 * frees a page.
+	 */
+	barrier();
+	WRITE_ONCE(current->capture_control, &capc);
+
+	ret = compact_zone(&cc, &capc);
+
+	VM_BUG_ON(!list_empty(&cc.freepages));
+	VM_BUG_ON(!list_empty(&cc.migratepages));
+
+	/*
+	 * Make sure we hide capture control first before we read the captured
+	 * page pointer, otherwise an interrupt could free and capture a page
+	 * and we would leak it.
+	 */
+	WRITE_ONCE(current->capture_control, NULL);
+	*capture = READ_ONCE(capc.page);
+	/*
+	 * Technically, it is also possible that compaction is skipped but
+	 * the page is still captured by luck (an IRQ came and freed the page).
+	 * Returning COMPACT_SUCCESS in such cases helps in properly accounting
+	 * the COMPACT[STALL|FAIL] when compaction is skipped.
+	 */
+	if (*capture)
+		ret = COMPACT_SUCCESS;
+
+	return ret;
+}
+
+int sysctl_extfrag_threshold = 500;
+
+/**
+ * try_to_compact_pages - Direct compact to satisfy a high-order allocation
+ * @gfp_mask: The GFP mask of the current allocation
+ * @order: The order of the current allocation
+ * @alloc_flags: The allocation flags of the current allocation
+ * @ac: The context of current allocation
+ * @prio: Determines how hard direct compaction should try to succeed
+ * @capture: Pointer to free page created by compaction will be stored here
+ *
+ * This is the main entry point for direct page compaction.
+ *
+ * Return: COMPACT_SKIPPED if @gfp_mask lacks __GFP_IO or no zone was
+ * visited; otherwise the largest enum compact_result value seen over the
+ * zones tried, with deferred zones counted as COMPACT_DEFERRED.
+ */
+enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
+		unsigned int alloc_flags, const struct alloc_context *ac,
+		enum compact_priority prio, struct page **capture)
+{
+	int may_perform_io = (__force int)(gfp_mask & __GFP_IO);
+	struct zoneref *z;
+	struct zone *zone;
+	enum compact_result rc = COMPACT_SKIPPED;
+
+	/*
+	 * Check if the GFP flags allow compaction - GFP_NOIO is really
+	 * tricky context because the migration might require IO
+	 */
+	if (!may_perform_io)
+		return COMPACT_SKIPPED;
+
+	trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
+
+	/* Compact each zone in the list */
+	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+					ac->highest_zoneidx, ac->nodemask) {
+		enum compact_result status;
+
+		if (prio > MIN_COMPACT_PRIORITY
+					&& compaction_deferred(zone, order)) {
+			rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
+			continue;
+		}
+
+		status = compact_zone_order(zone, order, gfp_mask, prio,
+				alloc_flags, ac->highest_zoneidx, capture);
+		rc = max(status, rc);
+
+		/* The allocation should succeed, stop compacting */
+		if (status == COMPACT_SUCCESS) {
+			/*
+			 * We think the allocation will succeed in this zone,
+			 * but it is not certain, hence the false. The caller
+			 * will repeat this with true if allocation indeed
+			 * succeeds in this zone.
+			 */
+			compaction_defer_reset(zone, order, false);
+
+			break;
+		}
+
+		if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
+					status == COMPACT_PARTIAL_SKIPPED))
+			/*
+			 * We think that allocation won't succeed in this zone
+			 * so we defer compaction there. If it ends up
+			 * succeeding after all, it will be reset.
+			 */
+			defer_compaction(zone, order);
+
+		/*
+		 * We might have stopped compacting due to need_resched() in
+		 * async compaction, or due to a fatal signal detected. In that
+		 * case do not try further zones
+		 */
+		if ((prio == COMPACT_PRIO_ASYNC && need_resched())
+					|| fatal_signal_pending(current))
+			break;
+	}
+
+	return rc;
+}
+
+/*
+ * Compact all zones within a node till each zone's fragmentation score
+ * reaches within proactive compaction thresholds (as determined by the
+ * proactiveness tunable).
+ *
+ * It is possible that the function returns before reaching score targets
+ * due to various back-off conditions, such as, contention on per-node or
+ * per-zone locks.
+ */ +static void proactive_compact_node(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = -1, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + .whole_zone = true, + .gfp_mask = GFP_KERNEL, + .proactive_compaction = true, + }; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc.zone = zone; + + compact_zone(&cc, NULL); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } +} + +/* Compact all zones within a node */ +static void compact_node(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = -1, + .mode = MIGRATE_SYNC, + .ignore_skip_hint = true, + .whole_zone = true, + .gfp_mask = GFP_KERNEL, + }; + + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc.zone = zone; + + compact_zone(&cc, NULL); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } +} + +/* Compact all nodes in the system */ +static void compact_nodes(void) +{ + int nid; + + /* Flush pending updates to the LRU lists */ + lru_add_drain_all(); + + for_each_online_node(nid) + compact_node(nid); +} + +/* + * Tunable for proactive compaction. It determines how + * aggressively the kernel should compact memory in the + * background. It takes values in the range [0, 100]. 
+ */ +unsigned int __read_mostly sysctl_compaction_proactiveness = 20; + +int compaction_proactiveness_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int rc, nid; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write && sysctl_compaction_proactiveness) { + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + + if (pgdat->proactive_compact_trigger) + continue; + + pgdat->proactive_compact_trigger = true; + wake_up_interruptible(&pgdat->kcompactd_wait); + } + } + + return 0; +} + +/* + * This is the entry point for compacting all nodes via + * /proc/sys/vm/compact_memory + */ +int sysctl_compaction_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + if (write) + compact_nodes(); + + return 0; +} + +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +static ssize_t compact_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int nid = dev->id; + + if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { + /* Flush pending updates to the LRU lists */ + lru_add_drain_all(); + + compact_node(nid); + } + + return count; +} +static DEVICE_ATTR_WO(compact); + +int compaction_register_node(struct node *node) +{ + return device_create_file(&node->dev, &dev_attr_compact); +} + +void compaction_unregister_node(struct node *node) +{ + return device_remove_file(&node->dev, &dev_attr_compact); +} +#endif /* CONFIG_SYSFS && CONFIG_NUMA */ + +static inline bool kcompactd_work_requested(pg_data_t *pgdat) +{ + return pgdat->kcompactd_max_order > 0 || kthread_should_stop() || + pgdat->proactive_compact_trigger; +} + +static bool kcompactd_node_suitable(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx; + + for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + + if 
(!populated_zone(zone)) + continue; + + if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, + highest_zoneidx) == COMPACT_CONTINUE) + return true; + } + + return false; +} + +static void kcompactd_do_work(pg_data_t *pgdat) +{ + /* + * With no special task, compact all zones so that a page of requested + * order is allocatable. + */ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = pgdat->kcompactd_max_order, + .search_order = pgdat->kcompactd_max_order, + .highest_zoneidx = pgdat->kcompactd_highest_zoneidx, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = false, + .gfp_mask = GFP_KERNEL, + }; + trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, + cc.highest_zoneidx); + count_compact_event(KCOMPACTD_WAKE); + + for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) { + int status; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + if (compaction_deferred(zone, cc.order)) + continue; + + if (compaction_suitable(zone, cc.order, 0, zoneid) != + COMPACT_CONTINUE) + continue; + + if (kthread_should_stop()) + return; + + cc.zone = zone; + status = compact_zone(&cc, NULL); + + if (status == COMPACT_SUCCESS) { + compaction_defer_reset(zone, cc.order, false); + } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { + /* + * Buddy pages may become stranded on pcps that could + * otherwise coalesce on the zone's free area for + * order >= cc.order. This is ratelimited by the + * upcoming deferral. + */ + drain_all_pages(zone); + + /* + * We use sync migration mode here, so we defer like + * sync direct compaction does. + */ + defer_compaction(zone, cc.order); + } + + count_compact_events(KCOMPACTD_MIGRATE_SCANNED, + cc.total_migrate_scanned); + count_compact_events(KCOMPACTD_FREE_SCANNED, + cc.total_free_scanned); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + /* + * Regardless of success, we are done until woken up next. 
But remember + * the requested order/highest_zoneidx in case it was higher/tighter + * than our current ones + */ + if (pgdat->kcompactd_max_order <= cc.order) + pgdat->kcompactd_max_order = 0; + if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; +} + +void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) +{ + if (!order) + return; + + if (pgdat->kcompactd_max_order < order) + pgdat->kcompactd_max_order = order; + + if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx) + pgdat->kcompactd_highest_zoneidx = highest_zoneidx; + + /* + * Pairs with implicit barrier in wait_event_freezable() + * such that wakeups are not missed. + */ + if (!wq_has_sleeper(&pgdat->kcompactd_wait)) + return; + + if (!kcompactd_node_suitable(pgdat)) + return; + + trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, + highest_zoneidx); + wake_up_interruptible(&pgdat->kcompactd_wait); +} + +/* + * The background compaction daemon, started as a kernel thread + * from the init process. + */ +static int kcompactd(void *p) +{ + pg_data_t *pgdat = (pg_data_t *)p; + struct task_struct *tsk = current; + long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC); + long timeout = default_timeout; + + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + pgdat->kcompactd_max_order = 0; + pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1; + + while (!kthread_should_stop()) { + unsigned long pflags; + + /* + * Avoid the unnecessary wakeup for proactive compaction + * when it is disabled. 
+ */ + if (!sysctl_compaction_proactiveness) + timeout = MAX_SCHEDULE_TIMEOUT; + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); + if (wait_event_freezable_timeout(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat), timeout) && + !pgdat->proactive_compact_trigger) { + + psi_memstall_enter(&pflags); + kcompactd_do_work(pgdat); + psi_memstall_leave(&pflags); + /* + * Reset the timeout value. The defer timeout from + * proactive compaction is lost here but that is fine + * as the condition of the zone changing substantially + * then carrying on with the previous defer interval is + * not useful. + */ + timeout = default_timeout; + continue; + } + + /* + * Start the proactive work with default timeout. Based + * on the fragmentation score, this timeout is updated. + */ + timeout = default_timeout; + if (should_proactive_compact_node(pgdat)) { + unsigned int prev_score, score; + + prev_score = fragmentation_score_node(pgdat); + proactive_compact_node(pgdat); + score = fragmentation_score_node(pgdat); + /* + * Defer proactive compaction if the fragmentation + * score did not go down i.e. no progress made. + */ + if (unlikely(score >= prev_score)) + timeout = + default_timeout << COMPACT_MAX_DEFER_SHIFT; + } + if (unlikely(pgdat->proactive_compact_trigger)) + pgdat->proactive_compact_trigger = false; + } + + return 0; +} + +/* + * This kcompactd start function will be called by init and node-hot-add. + * On node-hot-add, kcompactd will be moved to proper cpus if cpus are hot-added. + */ +void kcompactd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + + if (pgdat->kcompactd) + return; + + pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); + if (IS_ERR(pgdat->kcompactd)) { + pr_err("Failed to start kcompactd on node %d\n", nid); + pgdat->kcompactd = NULL; + } +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * be holding mem_hotplug_begin/done(). 
+ */ +void kcompactd_stop(int nid) +{ + struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd; + + if (kcompactd) { + kthread_stop(kcompactd); + NODE_DATA(nid)->kcompactd = NULL; + } +} + +/* + * It's optimal to keep kcompactd on the same CPUs as their memory, but + * not required for correctness. So if the last cpu in a node goes + * away, we get changed to run anywhere: as the first one comes back, + * restore their cpu bindings. + */ +static int kcompactd_cpu_online(unsigned int cpu) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + if (pgdat->kcompactd) + set_cpus_allowed_ptr(pgdat->kcompactd, mask); + } + return 0; +} + +static int __init kcompactd_init(void) +{ + int nid; + int ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "mm/compaction:online", + kcompactd_cpu_online, NULL); + if (ret < 0) { + pr_err("kcompactd: failed to register hotplug callbacks.\n"); + return ret; + } + + for_each_node_state(nid, N_MEMORY) + kcompactd_run(nid); + return 0; +} +subsys_initcall(kcompactd_init) + +#endif /* CONFIG_COMPACTION */ diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig new file mode 100644 index 000000000..7821fcb3f --- /dev/null +++ b/mm/damon/Kconfig @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: GPL-2.0-only + +menu "Data Access Monitoring" + +config DAMON + bool "DAMON: Data Access Monitoring Framework" + help + This builds a framework that allows kernel subsystems to monitor + access frequency of each memory region. The information can be useful + for performance-centric DRAM level memory management. + + See https://damonitor.github.io/doc/html/latest-damon/index.html for + more information. 
+ +config DAMON_KUNIT_TEST + bool "Test for damon" if !KUNIT_ALL_TESTS + depends on DAMON && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + +config DAMON_VADDR + bool "Data access monitoring operations for virtual address spaces" + depends on DAMON && MMU + select PAGE_IDLE_FLAG + help + This builds the default data access monitoring operations for DAMON + that work for virtual address spaces. + +config DAMON_PADDR + bool "Data access monitoring operations for the physical address space" + depends on DAMON && MMU + select PAGE_IDLE_FLAG + help + This builds the default data access monitoring operations for DAMON + that works for the physical address space. + +config DAMON_VADDR_KUNIT_TEST + bool "Test for DAMON operations" if !KUNIT_ALL_TESTS + depends on DAMON_VADDR && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON virtual addresses operations Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + +config DAMON_SYSFS + bool "DAMON sysfs interface" + depends on DAMON && SYSFS + help + This builds the sysfs interface for DAMON. The user space can use + the interface for arbitrary data access monitoring. + +config DAMON_DBGFS + bool "DAMON debugfs interface" + depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS + help + This builds the debugfs interface for DAMON. The user space admins + can use the interface for arbitrary data access monitoring. + + If unsure, say N. + + This will be removed after >5.15.y LTS kernel is released, so users + should move to the sysfs interface (DAMON_SYSFS). 
+ +config DAMON_DBGFS_KUNIT_TEST + bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS + depends on DAMON_DBGFS && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON debugfs interface Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + +config DAMON_RECLAIM + bool "Build DAMON-based reclaim (DAMON_RECLAIM)" + depends on DAMON_PADDR + help + This builds the DAMON-based reclamation subsystem. It finds pages + that are not accessed for a long time (cold) using DAMON and reclaims + those. + + This is suggested to be used as a proactive and lightweight + reclamation under light memory pressure, while the traditional page + scanning-based reclamation is used for heavy pressure. + +config DAMON_LRU_SORT + bool "Build DAMON-based LRU-lists sorting (DAMON_LRU_SORT)" + depends on DAMON_PADDR + help + This builds the DAMON-based LRU-lists sorting subsystem. It tries to + protect frequently accessed (hot) pages while rarely accessed (cold) + pages are reclaimed first under memory pressure. + +endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile new file mode 100644 index 000000000..3e6b8ad73 --- /dev/null +++ b/mm/damon/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y := core.o +obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o +obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs.o +obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o +obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o +obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h new file mode 100644 index 000000000..3db9b7368 --- /dev/null +++ b/mm/damon/core-test.h @@ -0,0 +1,313 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Access Monitor Unit Tests + * + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. 
+ * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_KUNIT_TEST + +#ifndef _DAMON_CORE_TEST_H +#define _DAMON_CORE_TEST_H + +#include <kunit/test.h> + +static void damon_test_regions(struct kunit *test) +{ + struct damon_region *r; + struct damon_target *t; + + r = damon_new_region(1, 2); + KUNIT_EXPECT_EQ(test, 1ul, r->ar.start); + KUNIT_EXPECT_EQ(test, 2ul, r->ar.end); + KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); + + t = damon_new_target(); + KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, 1u, damon_nr_regions(t)); + + damon_del_region(r, t); + KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); + + damon_free_target(t); +} + +static unsigned int nr_damon_targets(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_targets = 0; + + damon_for_each_target(t, ctx) + nr_targets++; + + return nr_targets; +} + +static void damon_test_target(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + + t = damon_new_target(); + KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); + + damon_add_target(c, t); + KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(c)); + + damon_destroy_target(t); + KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c)); + + damon_destroy_ctx(c); +} + +/* + * Test kdamond_reset_aggregated() + * + * DAMON checks access to each region and aggregates this information as the + * access frequency of each region. In detail, it increases '->nr_accesses' of + * regions that an access has confirmed. 'kdamond_reset_aggregated()' flushes + * the aggregated information ('->nr_accesses' of each region) to the result + * buffer. As a result of the flushing, the '->nr_accesses' of regions are + * initialized to zero. 
+ */ +static void damon_test_aggregate(struct kunit *test) +{ + struct damon_ctx *ctx = damon_new_ctx(); + unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} }; + unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} }; + unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} }; + struct damon_target *t; + struct damon_region *r; + int it, ir; + + for (it = 0; it < 3; it++) { + t = damon_new_target(); + damon_add_target(ctx, t); + } + + it = 0; + damon_for_each_target(t, ctx) { + for (ir = 0; ir < 3; ir++) { + r = damon_new_region(saddr[it][ir], eaddr[it][ir]); + r->nr_accesses = accesses[it][ir]; + damon_add_region(r, t); + } + it++; + } + kdamond_reset_aggregated(ctx); + it = 0; + damon_for_each_target(t, ctx) { + ir = 0; + /* '->nr_accesses' should be zeroed */ + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses); + ir++; + } + /* regions should be preserved */ + KUNIT_EXPECT_EQ(test, 3, ir); + it++; + } + /* targets also should be preserved */ + KUNIT_EXPECT_EQ(test, 3, it); + + damon_destroy_ctx(ctx); +} + +static void damon_test_split_at(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + + t = damon_new_target(); + r = damon_new_region(0, 100); + damon_add_region(r, t); + damon_split_region_at(t, r, 25); + KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 25ul); + + r = damon_next_region(r); + KUNIT_EXPECT_EQ(test, r->ar.start, 25ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 100ul); + + damon_free_target(t); + damon_destroy_ctx(c); +} + +static void damon_test_merge_two(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r, *r2, *r3; + int i; + + t = damon_new_target(); + r = damon_new_region(0, 100); + r->nr_accesses = 10; + damon_add_region(r, t); + r2 = damon_new_region(100, 300); + r2->nr_accesses = 20; + damon_add_region(r2, t); + + damon_merge_two_regions(t, r, r2); + 
KUNIT_EXPECT_EQ(test, r->ar.start, 0ul); + KUNIT_EXPECT_EQ(test, r->ar.end, 300ul); + KUNIT_EXPECT_EQ(test, r->nr_accesses, 16u); + + i = 0; + damon_for_each_region(r3, t) { + KUNIT_EXPECT_PTR_EQ(test, r, r3); + i++; + } + KUNIT_EXPECT_EQ(test, i, 1); + + damon_free_target(t); +} + +static struct damon_region *__nth_region_of(struct damon_target *t, int idx) +{ + struct damon_region *r; + unsigned int i = 0; + + damon_for_each_region(r, t) { + if (i++ == idx) + return r; + } + + return NULL; +} + +static void damon_test_merge_regions_of(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sa[] = {0, 100, 114, 122, 130, 156, 170, 184}; + unsigned long ea[] = {100, 112, 122, 130, 156, 170, 184, 230}; + unsigned int nrs[] = {0, 0, 10, 10, 20, 30, 1, 2}; + + unsigned long saddrs[] = {0, 114, 130, 156, 170}; + unsigned long eaddrs[] = {112, 130, 156, 170, 230}; + int i; + + t = damon_new_target(); + for (i = 0; i < ARRAY_SIZE(sa); i++) { + r = damon_new_region(sa[i], ea[i]); + r->nr_accesses = nrs[i]; + damon_add_region(r, t); + } + + damon_merge_regions_of(t, 9, 9999); + /* 0-112, 114-130, 130-156, 156-170 */ + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u); + for (i = 0; i < 5; i++) { + r = __nth_region_of(t, i); + KUNIT_EXPECT_EQ(test, r->ar.start, saddrs[i]); + KUNIT_EXPECT_EQ(test, r->ar.end, eaddrs[i]); + } + damon_free_target(t); +} + +static void damon_test_split_regions_of(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_target *t; + struct damon_region *r; + + t = damon_new_target(); + r = damon_new_region(0, 22); + damon_add_region(r, t); + damon_split_regions_of(t, 2); + KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); + damon_free_target(t); + + t = damon_new_target(); + r = damon_new_region(0, 220); + damon_add_region(r, t); + damon_split_regions_of(t, 4); + KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); + damon_free_target(t); + damon_destroy_ctx(c); +} + +static void 
damon_test_ops_registration(struct kunit *test) +{ + struct damon_ctx *c = damon_new_ctx(); + struct damon_operations ops, bak; + + /* DAMON_OPS_{V,P}ADDR are registered on subsys_initcall */ + KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_VADDR), 0); + KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_PADDR), 0); + + /* Double-registration is prohibited */ + ops.id = DAMON_OPS_VADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); + ops.id = DAMON_OPS_PADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); + + /* Unknown ops id cannot be registered */ + KUNIT_EXPECT_EQ(test, damon_select_ops(c, NR_DAMON_OPS), -EINVAL); + + /* Registration should success after unregistration */ + mutex_lock(&damon_ops_lock); + bak = damon_registered_ops[DAMON_OPS_VADDR]; + damon_registered_ops[DAMON_OPS_VADDR] = (struct damon_operations){}; + mutex_unlock(&damon_ops_lock); + + ops.id = DAMON_OPS_VADDR; + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), 0); + + mutex_lock(&damon_ops_lock); + damon_registered_ops[DAMON_OPS_VADDR] = bak; + mutex_unlock(&damon_ops_lock); + + /* Check double-registration failure again */ + KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); +} + +static void damon_test_set_regions(struct kunit *test) +{ + struct damon_target *t = damon_new_target(); + struct damon_region *r1 = damon_new_region(4, 16); + struct damon_region *r2 = damon_new_region(24, 32); + struct damon_addr_range range = {.start = 8, .end = 28}; + unsigned long expects[] = {8, 16, 16, 24, 24, 28}; + int expect_idx = 0; + struct damon_region *r; + + damon_add_region(r1, t); + damon_add_region(r2, t); + damon_set_regions(t, &range, 1); + + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]); + KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]); + } + damon_destroy_target(t); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_test_target), + 
KUNIT_CASE(damon_test_regions), + KUNIT_CASE(damon_test_aggregate), + KUNIT_CASE(damon_test_split_at), + KUNIT_CASE(damon_test_merge_two), + KUNIT_CASE(damon_test_merge_regions_of), + KUNIT_CASE(damon_test_split_regions_of), + KUNIT_CASE(damon_test_ops_registration), + KUNIT_CASE(damon_test_set_regions), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_CORE_TEST_H */ + +#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/core.c b/mm/damon/core.c new file mode 100644 index 000000000..5db9bec8a --- /dev/null +++ b/mm/damon/core.c @@ -0,0 +1,1310 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Data Access Monitor + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon: " fmt + +#include <linux/damon.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/string.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/damon.h> + +#ifdef CONFIG_DAMON_KUNIT_TEST +#undef DAMON_MIN_REGION +#define DAMON_MIN_REGION 1 +#endif + +static DEFINE_MUTEX(damon_lock); +static int nr_running_ctxs; +static bool running_exclusive_ctxs; + +static DEFINE_MUTEX(damon_ops_lock); +static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; + +static struct kmem_cache *damon_region_cache __ro_after_init; + +/* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */ +static bool __damon_is_registered_ops(enum damon_ops_id id) +{ + struct damon_operations empty_ops = {}; + + if (!memcmp(&empty_ops, &damon_registered_ops[id], sizeof(empty_ops))) + return false; + return true; +} + +/** + * damon_is_registered_ops() - Check if a given damon_operations is registered. + * @id: Id of the damon_operations to check if registered. + * + * Return: true if the ops is set, false otherwise. 
+ */ +bool damon_is_registered_ops(enum damon_ops_id id) +{ + bool registered; + + if (id >= NR_DAMON_OPS) + return false; + mutex_lock(&damon_ops_lock); + registered = __damon_is_registered_ops(id); + mutex_unlock(&damon_ops_lock); + return registered; +} + +/** + * damon_register_ops() - Register a monitoring operations set to DAMON. + * @ops: monitoring operations set to register. + * + * This function registers a monitoring operations set of valid &struct + * damon_operations->id so that others can find and use them later. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_register_ops(struct damon_operations *ops) +{ + int err = 0; + + if (ops->id >= NR_DAMON_OPS) + return -EINVAL; + mutex_lock(&damon_ops_lock); + /* Fail for already registered ops */ + if (__damon_is_registered_ops(ops->id)) { + err = -EINVAL; + goto out; + } + damon_registered_ops[ops->id] = *ops; +out: + mutex_unlock(&damon_ops_lock); + return err; +} + +/** + * damon_select_ops() - Select a monitoring operations to use with the context. + * @ctx: monitoring context to use the operations. + * @id: id of the registered monitoring operations to select. + * + * This function finds registered monitoring operations set of @id and make + * @ctx to use it. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id) +{ + int err = 0; + + if (id >= NR_DAMON_OPS) + return -EINVAL; + + mutex_lock(&damon_ops_lock); + if (!__damon_is_registered_ops(id)) + err = -EINVAL; + else + ctx->ops = damon_registered_ops[id]; + mutex_unlock(&damon_ops_lock); + return err; +} + +/* + * Construct a damon_region struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_region *damon_new_region(unsigned long start, unsigned long end) +{ + struct damon_region *region; + + region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL); + if (!region) + return NULL; + + region->ar.start = start; + region->ar.end = end; + region->nr_accesses = 0; + INIT_LIST_HEAD(&region->list); + + region->age = 0; + region->last_nr_accesses = 0; + + return region; +} + +void damon_add_region(struct damon_region *r, struct damon_target *t) +{ + list_add_tail(&r->list, &t->regions_list); + t->nr_regions++; +} + +static void damon_del_region(struct damon_region *r, struct damon_target *t) +{ + list_del(&r->list); + t->nr_regions--; +} + +static void damon_free_region(struct damon_region *r) +{ + kmem_cache_free(damon_region_cache, r); +} + +void damon_destroy_region(struct damon_region *r, struct damon_target *t) +{ + damon_del_region(r, t); + damon_free_region(r); +} + +/* + * Check whether a region is intersecting an address range + * + * Returns true if it is. + */ +static bool damon_intersect(struct damon_region *r, + struct damon_addr_range *re) +{ + return !(r->ar.end <= re->start || re->end <= r->ar.start); +} + +/* + * Fill holes in regions with new regions. 
+ */ +static int damon_fill_regions_holes(struct damon_region *first, + struct damon_region *last, struct damon_target *t) +{ + struct damon_region *r = first; + + damon_for_each_region_from(r, t) { + struct damon_region *next, *newr; + + if (r == last) + break; + next = damon_next_region(r); + if (r->ar.end != next->ar.start) { + newr = damon_new_region(r->ar.end, next->ar.start); + if (!newr) + return -ENOMEM; + damon_insert_region(newr, r, next, t); + } + } + return 0; +} + +/* + * damon_set_regions() - Set regions of a target for given address ranges. + * @t: the given target. + * @ranges: array of new monitoring target ranges. + * @nr_ranges: length of @ranges. + * + * This function adds new regions to, or modify existing regions of a + * monitoring target to fit in specific ranges. + * + * Return: 0 if success, or negative error code otherwise. + */ +int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, + unsigned int nr_ranges) +{ + struct damon_region *r, *next; + unsigned int i; + int err; + + /* Remove regions which are not in the new ranges */ + damon_for_each_region_safe(r, next, t) { + for (i = 0; i < nr_ranges; i++) { + if (damon_intersect(r, &ranges[i])) + break; + } + if (i == nr_ranges) + damon_destroy_region(r, t); + } + + r = damon_first_region(t); + /* Add new regions or resize existing regions to fit in the ranges */ + for (i = 0; i < nr_ranges; i++) { + struct damon_region *first = NULL, *last, *newr; + struct damon_addr_range *range; + + range = &ranges[i]; + /* Get the first/last regions intersecting with the range */ + damon_for_each_region_from(r, t) { + if (damon_intersect(r, range)) { + if (!first) + first = r; + last = r; + } + if (r->ar.start >= range->end) + break; + } + if (!first) { + /* no region intersects with this range */ + newr = damon_new_region( + ALIGN_DOWN(range->start, + DAMON_MIN_REGION), + ALIGN(range->end, DAMON_MIN_REGION)); + if (!newr) + return -ENOMEM; + damon_insert_region(newr, 
damon_prev_region(r), r, t); + } else { + /* resize intersecting regions to fit in this range */ + first->ar.start = ALIGN_DOWN(range->start, + DAMON_MIN_REGION); + last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + + /* fill possible holes in the range */ + err = damon_fill_regions_holes(first, last, t); + if (err) + return err; + } + } + return 0; +} + +/* initialize private fields of damos_quota and return the pointer */ +static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) +{ + quota->total_charged_sz = 0; + quota->total_charged_ns = 0; + quota->esz = 0; + quota->charged_sz = 0; + quota->charged_from = 0; + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return quota; +} + +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks) +{ + struct damos *scheme; + + scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); + if (!scheme) + return NULL; + scheme->pattern = *pattern; + scheme->action = action; + scheme->stat = (struct damos_stat){}; + INIT_LIST_HEAD(&scheme->list); + + scheme->quota = *(damos_quota_init_priv(quota)); + + scheme->wmarks = *wmarks; + scheme->wmarks.activated = true; + + return scheme; +} + +void damon_add_scheme(struct damon_ctx *ctx, struct damos *s) +{ + list_add_tail(&s->list, &ctx->schemes); +} + +static void damon_del_scheme(struct damos *s) +{ + list_del(&s->list); +} + +static void damon_free_scheme(struct damos *s) +{ + kfree(s); +} + +void damon_destroy_scheme(struct damos *s) +{ + damon_del_scheme(s); + damon_free_scheme(s); +} + +/* + * Construct a damon_target struct + * + * Returns the pointer to the new struct if success, or NULL otherwise + */ +struct damon_target *damon_new_target(void) +{ + struct damon_target *t; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return NULL; + + t->pid = NULL; + t->nr_regions = 0; + INIT_LIST_HEAD(&t->regions_list); + INIT_LIST_HEAD(&t->list); + + 
return t; +} + +void damon_add_target(struct damon_ctx *ctx, struct damon_target *t) +{ + list_add_tail(&t->list, &ctx->adaptive_targets); +} + +bool damon_targets_empty(struct damon_ctx *ctx) +{ + return list_empty(&ctx->adaptive_targets); +} + +static void damon_del_target(struct damon_target *t) +{ + list_del(&t->list); +} + +void damon_free_target(struct damon_target *t) +{ + struct damon_region *r, *next; + + damon_for_each_region_safe(r, next, t) + damon_free_region(r); + kfree(t); +} + +void damon_destroy_target(struct damon_target *t) +{ + damon_del_target(t); + damon_free_target(t); +} + +unsigned int damon_nr_regions(struct damon_target *t) +{ + return t->nr_regions; +} + +struct damon_ctx *damon_new_ctx(void) +{ + struct damon_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + init_completion(&ctx->kdamond_started); + + ctx->attrs.sample_interval = 5 * 1000; + ctx->attrs.aggr_interval = 100 * 1000; + ctx->attrs.ops_update_interval = 60 * 1000 * 1000; + + ktime_get_coarse_ts64(&ctx->last_aggregation); + ctx->last_ops_update = ctx->last_aggregation; + + mutex_init(&ctx->kdamond_lock); + + ctx->attrs.min_nr_regions = 10; + ctx->attrs.max_nr_regions = 1000; + + INIT_LIST_HEAD(&ctx->adaptive_targets); + INIT_LIST_HEAD(&ctx->schemes); + + return ctx; +} + +static void damon_destroy_targets(struct damon_ctx *ctx) +{ + struct damon_target *t, *next_t; + + if (ctx->ops.cleanup) { + ctx->ops.cleanup(ctx); + return; + } + + damon_for_each_target_safe(t, next_t, ctx) + damon_destroy_target(t); +} + +void damon_destroy_ctx(struct damon_ctx *ctx) +{ + struct damos *s, *next_s; + + damon_destroy_targets(ctx); + + damon_for_each_scheme_safe(s, next_s, ctx) + damon_destroy_scheme(s); + + kfree(ctx); +} + +/** + * damon_set_attrs() - Set attributes for the monitoring. + * @ctx: monitoring context + * @attrs: monitoring attributes + * + * This function should not be called while the kdamond is running. 
+ * Every time interval is in micro-seconds. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) +{ + if (attrs->min_nr_regions < 3) + return -EINVAL; + if (attrs->min_nr_regions > attrs->max_nr_regions) + return -EINVAL; + + ctx->attrs = *attrs; + return 0; +} + +/** + * damon_set_schemes() - Set data access monitoring based operation schemes. + * @ctx: monitoring context + * @schemes: array of the schemes + * @nr_schemes: number of entries in @schemes + * + * This function should not be called while the kdamond of the context is + * running. + */ +void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, + ssize_t nr_schemes) +{ + struct damos *s, *next; + ssize_t i; + + damon_for_each_scheme_safe(s, next, ctx) + damon_destroy_scheme(s); + for (i = 0; i < nr_schemes; i++) + damon_add_scheme(ctx, schemes[i]); +} + +/** + * damon_nr_running_ctxs() - Return number of currently running contexts. + */ +int damon_nr_running_ctxs(void) +{ + int nr_ctxs; + + mutex_lock(&damon_lock); + nr_ctxs = nr_running_ctxs; + mutex_unlock(&damon_lock); + + return nr_ctxs; +} + +/* Returns the size upper limit for each monitoring region */ +static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sz = 0; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) + sz += damon_sz_region(r); + } + + if (ctx->attrs.min_nr_regions) + sz /= ctx->attrs.min_nr_regions; + if (sz < DAMON_MIN_REGION) + sz = DAMON_MIN_REGION; + + return sz; +} + +static int kdamond_fn(void *data); + +/* + * __damon_start() - Starts monitoring with given context. + * @ctx: monitoring context + * + * This function should be called while damon_lock is hold. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +static int __damon_start(struct damon_ctx *ctx) +{ + int err = -EBUSY; + + mutex_lock(&ctx->kdamond_lock); + if (!ctx->kdamond) { + err = 0; + reinit_completion(&ctx->kdamond_started); + ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d", + nr_running_ctxs); + if (IS_ERR(ctx->kdamond)) { + err = PTR_ERR(ctx->kdamond); + ctx->kdamond = NULL; + } else { + wait_for_completion(&ctx->kdamond_started); + } + } + mutex_unlock(&ctx->kdamond_lock); + + return err; +} + +/** + * damon_start() - Starts the monitorings for a given group of contexts. + * @ctxs: an array of the pointers for contexts to start monitoring + * @nr_ctxs: size of @ctxs + * @exclusive: exclusiveness of this contexts group + * + * This function starts a group of monitoring threads for a group of monitoring + * contexts. One thread per each context is created and run in parallel. The + * caller should handle synchronization between the threads by itself. If + * @exclusive is true and a group of threads that created by other + * 'damon_start()' call is currently running, this function does nothing but + * returns -EBUSY. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive) +{ + int i; + int err = 0; + + mutex_lock(&damon_lock); + if ((exclusive && nr_running_ctxs) || + (!exclusive && running_exclusive_ctxs)) { + mutex_unlock(&damon_lock); + return -EBUSY; + } + + for (i = 0; i < nr_ctxs; i++) { + err = __damon_start(ctxs[i]); + if (err) + break; + nr_running_ctxs++; + } + if (exclusive && nr_running_ctxs) + running_exclusive_ctxs = true; + mutex_unlock(&damon_lock); + + return err; +} + +/* + * __damon_stop() - Stops monitoring of a given context. + * @ctx: monitoring context + * + * Return: 0 on success, negative error code otherwise. 
+ */ +static int __damon_stop(struct damon_ctx *ctx) +{ + struct task_struct *tsk; + + mutex_lock(&ctx->kdamond_lock); + tsk = ctx->kdamond; + if (tsk) { + get_task_struct(tsk); + mutex_unlock(&ctx->kdamond_lock); + kthread_stop(tsk); + put_task_struct(tsk); + return 0; + } + mutex_unlock(&ctx->kdamond_lock); + + return -EPERM; +} + +/** + * damon_stop() - Stops the monitorings for a given group of contexts. + * @ctxs: an array of the pointers for contexts to stop monitoring + * @nr_ctxs: size of @ctxs + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) +{ + int i, err = 0; + + for (i = 0; i < nr_ctxs; i++) { + /* nr_running_ctxs is decremented in kdamond_fn */ + err = __damon_stop(ctxs[i]); + if (err) + break; + } + return err; +} + +/* + * damon_check_reset_time_interval() - Check if a time interval is elapsed. + * @baseline: the time to check whether the interval has elapsed since + * @interval: the time interval (microseconds) + * + * See whether the given time interval has passed since the given baseline + * time. If so, it also updates the baseline to current time for next check. + * + * Return: true if the time interval has passed, or false otherwise. + */ +static bool damon_check_reset_time_interval(struct timespec64 *baseline, + unsigned long interval) +{ + struct timespec64 now; + + ktime_get_coarse_ts64(&now); + if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) < + interval * 1000) + return false; + *baseline = now; + return true; +} + +/* + * Check whether it is time to flush the aggregated information + */ +static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) +{ + return damon_check_reset_time_interval(&ctx->last_aggregation, + ctx->attrs.aggr_interval); +} + +/* + * Reset the aggregated monitoring results ('nr_accesses' of each region). 
+ */ +static void kdamond_reset_aggregated(struct damon_ctx *c) +{ + struct damon_target *t; + unsigned int ti = 0; /* target's index */ + + damon_for_each_target(t, c) { + struct damon_region *r; + + damon_for_each_region(r, t) { + trace_damon_aggregated(t, ti, r, damon_nr_regions(t)); + r->last_nr_accesses = r->nr_accesses; + r->nr_accesses = 0; + } + ti++; + } +} + +static void damon_split_region_at(struct damon_target *t, + struct damon_region *r, unsigned long sz_r); + +static bool __damos_valid_target(struct damon_region *r, struct damos *s) +{ + unsigned long sz; + + sz = damon_sz_region(r); + return s->pattern.min_sz_region <= sz && + sz <= s->pattern.max_sz_region && + s->pattern.min_nr_accesses <= r->nr_accesses && + r->nr_accesses <= s->pattern.max_nr_accesses && + s->pattern.min_age_region <= r->age && + r->age <= s->pattern.max_age_region; +} + +static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, + struct damon_region *r, struct damos *s) +{ + bool ret = __damos_valid_target(r, s); + + if (!ret || !s->quota.esz || !c->ops.get_scheme_score) + return ret; + + return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score; +} + +static void damon_do_apply_schemes(struct damon_ctx *c, + struct damon_target *t, + struct damon_region *r) +{ + struct damos *s; + + damon_for_each_scheme(s, c) { + struct damos_quota *quota = &s->quota; + unsigned long sz = damon_sz_region(r); + struct timespec64 begin, end; + unsigned long sz_applied = 0; + + if (!s->wmarks.activated) + continue; + + /* Check the quota */ + if (quota->esz && quota->charged_sz >= quota->esz) + continue; + + /* Skip previously charged regions */ + if (quota->charge_target_from) { + if (t != quota->charge_target_from) + continue; + if (r == damon_last_region(t)) { + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + continue; + } + if (quota->charge_addr_from && + r->ar.end <= quota->charge_addr_from) + continue; + + if (quota->charge_addr_from && 
r->ar.start < + quota->charge_addr_from) { + sz = ALIGN_DOWN(quota->charge_addr_from - + r->ar.start, DAMON_MIN_REGION); + if (!sz) { + if (damon_sz_region(r) <= + DAMON_MIN_REGION) + continue; + sz = DAMON_MIN_REGION; + } + damon_split_region_at(t, r, sz); + r = damon_next_region(r); + sz = damon_sz_region(r); + } + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + } + + if (!damos_valid_target(c, t, r, s)) + continue; + + /* Apply the scheme */ + if (c->ops.apply_scheme) { + if (quota->esz && + quota->charged_sz + sz > quota->esz) { + sz = ALIGN_DOWN(quota->esz - quota->charged_sz, + DAMON_MIN_REGION); + if (!sz) + goto update_stat; + damon_split_region_at(t, r, sz); + } + ktime_get_coarse_ts64(&begin); + sz_applied = c->ops.apply_scheme(c, t, r, s); + ktime_get_coarse_ts64(&end); + quota->total_charged_ns += timespec64_to_ns(&end) - + timespec64_to_ns(&begin); + quota->charged_sz += sz; + if (quota->esz && quota->charged_sz >= quota->esz) { + quota->charge_target_from = t; + quota->charge_addr_from = r->ar.end + 1; + } + } + if (s->action != DAMOS_STAT) + r->age = 0; + +update_stat: + s->stat.nr_tried++; + s->stat.sz_tried += sz; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; + } +} + +/* Shouldn't be called if quota->ms and quota->sz are zero */ +static void damos_set_effective_quota(struct damos_quota *quota) +{ + unsigned long throughput; + unsigned long esz; + + if (!quota->ms) { + quota->esz = quota->sz; + return; + } + + if (quota->total_charged_ns) + throughput = quota->total_charged_sz * 1000000 / + quota->total_charged_ns; + else + throughput = PAGE_SIZE * 1024; + esz = throughput * quota->ms; + + if (quota->sz && quota->sz < esz) + esz = quota->sz; + quota->esz = esz; +} + +static void kdamond_apply_schemes(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r, *next_r; + struct damos *s; + + damon_for_each_scheme(s, c) { + struct damos_quota *quota = &s->quota; + unsigned long 
cumulated_sz; + unsigned int score, max_score = 0; + + if (!s->wmarks.activated) + continue; + + if (!quota->ms && !quota->sz) + continue; + + /* New charge window starts */ + if (time_after_eq(jiffies, quota->charged_from + + msecs_to_jiffies( + quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; + quota->total_charged_sz += quota->charged_sz; + quota->charged_from = jiffies; + quota->charged_sz = 0; + damos_set_effective_quota(quota); + } + + if (!c->ops.get_scheme_score) + continue; + + /* Fill up the score histogram */ + memset(quota->histogram, 0, sizeof(quota->histogram)); + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + if (!__damos_valid_target(r, s)) + continue; + score = c->ops.get_scheme_score( + c, t, r, s); + quota->histogram[score] += damon_sz_region(r); + if (score > max_score) + max_score = score; + } + } + + /* Set the min score limit */ + for (cumulated_sz = 0, score = max_score; ; score--) { + cumulated_sz += quota->histogram[score]; + if (cumulated_sz >= quota->esz || !score) + break; + } + quota->min_score = score; + } + + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next_r, t) + damon_do_apply_schemes(c, t, r); + } +} + +/* + * Merge two adjacent regions into one region + */ +static void damon_merge_two_regions(struct damon_target *t, + struct damon_region *l, struct damon_region *r) +{ + unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r); + + l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / + (sz_l + sz_r); + l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r); + l->ar.end = r->ar.end; + damon_destroy_region(r, t); +} + +/* + * Merge adjacent regions having similar access frequencies + * + * t target affected by this merge operation + * thres '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + */ +static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, + 
unsigned long sz_limit) +{ + struct damon_region *r, *prev = NULL, *next; + + damon_for_each_region_safe(r, next, t) { + if (abs(r->nr_accesses - r->last_nr_accesses) > thres) + r->age = 0; + else + r->age++; + + if (prev && prev->ar.end == r->ar.start && + abs(prev->nr_accesses - r->nr_accesses) <= thres && + damon_sz_region(prev) + damon_sz_region(r) <= sz_limit) + damon_merge_two_regions(t, prev, r); + else + prev = r; + } +} + +/* + * Merge adjacent regions having similar access frequencies + * + * threshold '->nr_accesses' diff threshold for the merge + * sz_limit size upper limit of each region + * + * This function merges monitoring target regions which are adjacent and their + * access frequencies are similar. This is for minimizing the monitoring + * overhead under the dynamically changeable access pattern. If a merge was + * unnecessarily made, later 'kdamond_split_regions()' will revert it. + */ +static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold, + unsigned long sz_limit) +{ + struct damon_target *t; + + damon_for_each_target(t, c) + damon_merge_regions_of(t, threshold, sz_limit); +} + +/* + * Split a region in two + * + * r the region to be split + * sz_r size of the first sub-region that will be made + */ +static void damon_split_region_at(struct damon_target *t, + struct damon_region *r, unsigned long sz_r) +{ + struct damon_region *new; + + new = damon_new_region(r->ar.start + sz_r, r->ar.end); + if (!new) + return; + + r->ar.end = new->ar.start; + + new->age = r->age; + new->last_nr_accesses = r->last_nr_accesses; + + damon_insert_region(new, r, damon_next_region(r), t); +} + +/* Split every region in the given target into 'nr_subs' regions */ +static void damon_split_regions_of(struct damon_target *t, int nr_subs) +{ + struct damon_region *r, *next; + unsigned long sz_region, sz_sub = 0; + int i; + + damon_for_each_region_safe(r, next, t) { + sz_region = damon_sz_region(r); + + for (i = 0; i < nr_subs - 1 && + sz_region 
> 2 * DAMON_MIN_REGION; i++) { + /* + * Randomly select size of left sub-region to be at + * least 10 percent and at most 90% of original region + */ + sz_sub = ALIGN_DOWN(damon_rand(1, 10) * + sz_region / 10, DAMON_MIN_REGION); + /* Do not allow blank region */ + if (sz_sub == 0 || sz_sub >= sz_region) + continue; + + damon_split_region_at(t, r, sz_sub); + sz_region = sz_sub; + } + } +} + +/* + * Split every target region into randomly-sized small regions + * + * This function splits every target region into random-sized small regions if + * current total number of the regions is equal or smaller than half of the + * user-specified maximum number of regions. This is for maximizing the + * monitoring accuracy under the dynamically changeable access patterns. If a + * split was unnecessarily made, later 'kdamond_merge_regions()' will revert + * it. + */ +static void kdamond_split_regions(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_regions = 0; + static unsigned int last_nr_regions; + int nr_subregions = 2; + + damon_for_each_target(t, ctx) + nr_regions += damon_nr_regions(t); + + if (nr_regions > ctx->attrs.max_nr_regions / 2) + return; + + /* Maybe the middle of the region has different access frequency */ + if (last_nr_regions == nr_regions && + nr_regions < ctx->attrs.max_nr_regions / 3) + nr_subregions = 3; + + damon_for_each_target(t, ctx) + damon_split_regions_of(t, nr_subregions); + + last_nr_regions = nr_regions; +} + +/* + * Check whether it is time to check and apply the operations-related data + * structures. + * + * Returns true if it is. + */ +static bool kdamond_need_update_operations(struct damon_ctx *ctx) +{ + return damon_check_reset_time_interval(&ctx->last_ops_update, + ctx->attrs.ops_update_interval); +} + +/* + * Check whether current monitoring should be stopped + * + * The monitoring is stopped when either the user requested to stop, or all + * monitoring targets are invalid. 
+ * + * Returns true if need to stop current monitoring. + */ +static bool kdamond_need_stop(struct damon_ctx *ctx) +{ + struct damon_target *t; + + if (kthread_should_stop()) + return true; + + if (!ctx->ops.target_valid) + return false; + + damon_for_each_target(t, ctx) { + if (ctx->ops.target_valid(t)) + return false; + } + + return true; +} + +static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric) +{ + struct sysinfo i; + + switch (metric) { + case DAMOS_WMARK_FREE_MEM_RATE: + si_meminfo(&i); + return i.freeram * 1000 / i.totalram; + default: + break; + } + return -EINVAL; +} + +/* + * Returns zero if the scheme is active. Else, returns time to wait for next + * watermark check in micro-seconds. + */ +static unsigned long damos_wmark_wait_us(struct damos *scheme) +{ + unsigned long metric; + + if (scheme->wmarks.metric == DAMOS_WMARK_NONE) + return 0; + + metric = damos_wmark_metric_value(scheme->wmarks.metric); + /* higher than high watermark or lower than low watermark */ + if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) { + if (scheme->wmarks.activated) + pr_debug("deactivate a scheme (%d) for %s wmark\n", + scheme->action, + metric > scheme->wmarks.high ? 
+ "high" : "low"); + scheme->wmarks.activated = false; + return scheme->wmarks.interval; + } + + /* inactive and higher than middle watermark */ + if ((scheme->wmarks.high >= metric && metric >= scheme->wmarks.mid) && + !scheme->wmarks.activated) + return scheme->wmarks.interval; + + if (!scheme->wmarks.activated) + pr_debug("activate a scheme (%d)\n", scheme->action); + scheme->wmarks.activated = true; + return 0; +} + +static void kdamond_usleep(unsigned long usecs) +{ + /* See Documentation/timers/timers-howto.rst for the thresholds */ + if (usecs > 20 * USEC_PER_MSEC) + schedule_timeout_idle(usecs_to_jiffies(usecs)); + else + usleep_idle_range(usecs, usecs + 1); +} + +/* Returns negative error code if it's not activated but should return */ +static int kdamond_wait_activation(struct damon_ctx *ctx) +{ + struct damos *s; + unsigned long wait_time; + unsigned long min_wait_time = 0; + bool init_wait_time = false; + + while (!kdamond_need_stop(ctx)) { + damon_for_each_scheme(s, ctx) { + wait_time = damos_wmark_wait_us(s); + if (!init_wait_time || wait_time < min_wait_time) { + init_wait_time = true; + min_wait_time = wait_time; + } + } + if (!min_wait_time) + return 0; + + kdamond_usleep(min_wait_time); + + if (ctx->callback.after_wmarks_check && + ctx->callback.after_wmarks_check(ctx)) + break; + } + return -EBUSY; +} + +/* + * The monitoring daemon that runs as a kernel thread + */ +static int kdamond_fn(void *data) +{ + struct damon_ctx *ctx = data; + struct damon_target *t; + struct damon_region *r, *next; + unsigned int max_nr_accesses = 0; + unsigned long sz_limit = 0; + + pr_debug("kdamond (%d) starts\n", current->pid); + + complete(&ctx->kdamond_started); + + if (ctx->ops.init) + ctx->ops.init(ctx); + if (ctx->callback.before_start && ctx->callback.before_start(ctx)) + goto done; + + sz_limit = damon_region_sz_limit(ctx); + + while (!kdamond_need_stop(ctx)) { + if (kdamond_wait_activation(ctx)) + break; + + if (ctx->ops.prepare_access_checks) + 
ctx->ops.prepare_access_checks(ctx); + if (ctx->callback.after_sampling && + ctx->callback.after_sampling(ctx)) + break; + + kdamond_usleep(ctx->attrs.sample_interval); + + if (ctx->ops.check_accesses) + max_nr_accesses = ctx->ops.check_accesses(ctx); + + if (kdamond_aggregate_interval_passed(ctx)) { + kdamond_merge_regions(ctx, + max_nr_accesses / 10, + sz_limit); + if (ctx->callback.after_aggregation && + ctx->callback.after_aggregation(ctx)) + break; + kdamond_apply_schemes(ctx); + kdamond_reset_aggregated(ctx); + kdamond_split_regions(ctx); + if (ctx->ops.reset_aggregated) + ctx->ops.reset_aggregated(ctx); + } + + if (kdamond_need_update_operations(ctx)) { + if (ctx->ops.update) + ctx->ops.update(ctx); + sz_limit = damon_region_sz_limit(ctx); + } + } +done: + damon_for_each_target(t, ctx) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + + if (ctx->callback.before_terminate) + ctx->callback.before_terminate(ctx); + if (ctx->ops.cleanup) + ctx->ops.cleanup(ctx); + + pr_debug("kdamond (%d) finishes\n", current->pid); + mutex_lock(&ctx->kdamond_lock); + ctx->kdamond = NULL; + mutex_unlock(&ctx->kdamond_lock); + + mutex_lock(&damon_lock); + nr_running_ctxs--; + if (!nr_running_ctxs && running_exclusive_ctxs) + running_exclusive_ctxs = false; + mutex_unlock(&damon_lock); + + return 0; +} + +/* + * struct damon_system_ram_region - System RAM resource address region of + * [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). + */ +struct damon_system_ram_region { + unsigned long start; + unsigned long end; +}; + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct damon_system_ram_region *a = arg; + + if (a->end - a->start < resource_size(res)) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + +/* + * Find biggest 'System RAM' resource and store its start and end address in + * @start and @end, respectively. 
If no System RAM is found, returns false. + */ +static bool damon_find_biggest_system_ram(unsigned long *start, + unsigned long *end) + +{ + struct damon_system_ram_region arg = {}; + + walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); + if (arg.end <= arg.start) + return false; + + *start = arg.start; + *end = arg.end; + return true; +} + +/** + * damon_set_region_biggest_system_ram_default() - Set the region of the given + * monitoring target as requested, or biggest 'System RAM'. + * @t: The monitoring target to set the region. + * @start: The pointer to the start address of the region. + * @end: The pointer to the end address of the region. + * + * This function sets the region of @t as requested by @start and @end. If the + * values of @start and @end are zero, however, this function finds the biggest + * 'System RAM' resource and sets the region to cover the resource. In the + * latter case, this function saves the start and end addresses of the resource + * in @start and @end, respectively. + * + * Return: 0 on success, negative error code otherwise. 
+ */ +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end) +{ + struct damon_addr_range addr_range; + + if (*start > *end) + return -EINVAL; + + if (!*start && !*end && + !damon_find_biggest_system_ram(start, end)) + return -EINVAL; + + addr_range.start = *start; + addr_range.end = *end; + return damon_set_regions(t, &addr_range, 1); +} + +static int __init damon_init(void) +{ + damon_region_cache = KMEM_CACHE(damon_region, 0); + if (unlikely(!damon_region_cache)) { + pr_err("creating damon_region_cache fails\n"); + return -ENOMEM; + } + + return 0; +} + +subsys_initcall(damon_init); + +#include "core-test.h" diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h new file mode 100644 index 000000000..0bb0d532b --- /dev/null +++ b/mm/damon/dbgfs-test.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * DAMON Debugfs Interface Unit Tests + * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_DBGFS_KUNIT_TEST + +#ifndef _DAMON_DBGFS_TEST_H +#define _DAMON_DBGFS_TEST_H + +#include <kunit/test.h> + +static void damon_dbgfs_test_str_to_ints(struct kunit *test) +{ + char *question; + int *answers; + int expected[] = {12, 35, 46}; + ssize_t nr_integers = 0, i; + + question = "123"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); + KUNIT_EXPECT_EQ(test, 123, answers[0]); + kfree(answers); + + question = "123abc"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers); + KUNIT_EXPECT_EQ(test, 123, answers[0]); + kfree(answers); + + question = "a123"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); + + question = "12 35"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); + for (i = 0; i < nr_integers; i++) + 
KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = "12 35 46"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers); + for (i = 0; i < nr_integers; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = "12 35 abc 46"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers); + for (i = 0; i < 2; i++) + KUNIT_EXPECT_EQ(test, expected[i], answers[i]); + kfree(answers); + + question = ""; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); + + question = "\n"; + answers = str_to_ints(question, strlen(question), &nr_integers); + KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers); + kfree(answers); +} + +static void damon_dbgfs_test_set_targets(struct kunit *test) +{ + struct damon_ctx *ctx = dbgfs_new_ctx(); + char buf[64]; + + /* Make DAMON consider target has no pid */ + damon_select_ops(ctx, DAMON_OPS_PADDR); + + dbgfs_set_targets(ctx, 0, NULL); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); + + dbgfs_set_targets(ctx, 1, NULL); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n"); + + dbgfs_set_targets(ctx, 0, NULL); + sprint_target_ids(ctx, buf, 64); + KUNIT_EXPECT_STREQ(test, (char *)buf, "\n"); + + dbgfs_destroy_ctx(ctx); +} + +static void damon_dbgfs_test_set_init_regions(struct kunit *test) +{ + struct damon_ctx *ctx = damon_new_ctx(); + /* Each line represents one region in ``<target idx> <start> <end>`` */ + char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45", + "1 10 20\n", + "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n", + ""}; + /* Reading the file again will show sorted, clean output */ + char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n", + "1 10 20\n", + "0 39 59\n0 70 134\n1 10 20\n1 20 25\n", + ""}; + char * const invalid_inputs[] = {"3 
10 20\n", /* target not exists */ + "1 10 20\n 1 14 26\n", /* regions overlap */ + "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */ + char *input, *expect; + int i, rc; + char buf[256]; + + damon_select_ops(ctx, DAMON_OPS_PADDR); + + dbgfs_set_targets(ctx, 3, NULL); + + /* Put valid inputs and check the results */ + for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) { + input = valid_inputs[i]; + expect = valid_expects[i]; + + rc = set_init_regions(ctx, input, strnlen(input, 256)); + KUNIT_EXPECT_EQ(test, rc, 0); + + memset(buf, 0, 256); + sprint_init_regions(ctx, buf, 256); + + KUNIT_EXPECT_STREQ(test, (char *)buf, expect); + } + /* Put invalid inputs and check the return error code */ + for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) { + input = invalid_inputs[i]; + pr_info("input: %s\n", input); + rc = set_init_regions(ctx, input, strnlen(input, 256)); + KUNIT_EXPECT_EQ(test, rc, -EINVAL); + + memset(buf, 0, 256); + sprint_init_regions(ctx, buf, 256); + + KUNIT_EXPECT_STREQ(test, (char *)buf, ""); + } + + dbgfs_set_targets(ctx, 0, NULL); + damon_destroy_ctx(ctx); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_dbgfs_test_str_to_ints), + KUNIT_CASE(damon_dbgfs_test_set_targets), + KUNIT_CASE(damon_dbgfs_test_set_init_regions), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon-dbgfs", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_TEST_H */ + +#endif /* CONFIG_DAMON_KUNIT_TEST */ diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c new file mode 100644 index 000000000..b3f454a5c --- /dev/null +++ b/mm/damon/dbgfs.c @@ -0,0 +1,1114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Debugfs Interface + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-dbgfs: " fmt + +#include +#include +#include +#include +#include +#include +#include + +static struct damon_ctx **dbgfs_ctxs; +static int dbgfs_nr_ctxs; +static struct dentry **dbgfs_dirs; +static 
DEFINE_MUTEX(damon_dbgfs_lock); + +/* + * Returns non-empty string on success, negative error code otherwise. + */ +static char *user_input_str(const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t ret; + + /* We do not accept continuous write */ + if (*ppos) + return ERR_PTR(-EINVAL); + + kbuf = kmalloc(count + 1, GFP_KERNEL | __GFP_NOWARN); + if (!kbuf) + return ERR_PTR(-ENOMEM); + + ret = simple_write_to_buffer(kbuf, count + 1, ppos, buf, count); + if (ret != count) { + kfree(kbuf); + return ERR_PTR(-EIO); + } + kbuf[ret] = '\0'; + + return kbuf; +} + +static ssize_t dbgfs_attrs_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char kbuf[128]; + int ret; + + mutex_lock(&ctx->kdamond_lock); + ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", + ctx->attrs.sample_interval, ctx->attrs.aggr_interval, + ctx->attrs.ops_update_interval, + ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions); + mutex_unlock(&ctx->kdamond_lock); + + return simple_read_from_buffer(buf, count, ppos, kbuf, ret); +} + +static ssize_t dbgfs_attrs_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + struct damon_attrs attrs; + char *kbuf; + ssize_t ret; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + if (sscanf(kbuf, "%lu %lu %lu %lu %lu", + &attrs.sample_interval, &attrs.aggr_interval, + &attrs.ops_update_interval, + &attrs.min_nr_regions, + &attrs.max_nr_regions) != 5) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + ret = damon_set_attrs(ctx, &attrs); + if (!ret) + ret = count; +unlock_out: + mutex_unlock(&ctx->kdamond_lock); +out: + kfree(kbuf); + return ret; +} + +/* + * Return corresponding dbgfs' scheme action value (int) for the given + * damos_action if the given 
damos_action value is valid and supported by + * dbgfs, negative error code otherwise. + */ +static int damos_action_to_dbgfs_scheme_action(enum damos_action action) +{ + switch (action) { + case DAMOS_WILLNEED: + return 0; + case DAMOS_COLD: + return 1; + case DAMOS_PAGEOUT: + return 2; + case DAMOS_HUGEPAGE: + return 3; + case DAMOS_NOHUGEPAGE: + return 4; + case DAMOS_STAT: + return 5; + default: + return -EINVAL; + } +} + +static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) +{ + struct damos *s; + int written = 0; + int rc; + + damon_for_each_scheme(s, c) { + rc = scnprintf(&buf[written], len - written, + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + s->pattern.min_sz_region, + s->pattern.max_sz_region, + s->pattern.min_nr_accesses, + s->pattern.max_nr_accesses, + s->pattern.min_age_region, + s->pattern.max_age_region, + damos_action_to_dbgfs_scheme_action(s->action), + s->quota.ms, s->quota.sz, + s->quota.reset_interval, + s->quota.weight_sz, + s->quota.weight_nr_accesses, + s->quota.weight_age, + s->wmarks.metric, s->wmarks.interval, + s->wmarks.high, s->wmarks.mid, s->wmarks.low, + s->stat.nr_tried, s->stat.sz_tried, + s->stat.nr_applied, s->stat.sz_applied, + s->stat.qt_exceeds); + if (!rc) + return -ENOMEM; + + written += rc; + } + return written; +} + +static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + len = sprint_schemes(ctx, kbuf, count); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + +static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes) +{ + ssize_t i; + + for (i = 0; i < nr_schemes; i++) + 
kfree(schemes[i]); + kfree(schemes); +} + +/* + * Return corresponding damos_action for the given dbgfs input for a scheme + * action if the input is valid, negative error code otherwise. + */ +static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action) +{ + switch (dbgfs_action) { + case 0: + return DAMOS_WILLNEED; + case 1: + return DAMOS_COLD; + case 2: + return DAMOS_PAGEOUT; + case 3: + return DAMOS_HUGEPAGE; + case 4: + return DAMOS_NOHUGEPAGE; + case 5: + return DAMOS_STAT; + default: + return -EINVAL; + } +} + +/* + * Converts a string into an array of struct damos pointers + * + * Returns an array of struct damos pointers that converted if the conversion + * success, or NULL otherwise. + */ +static struct damos **str_to_schemes(const char *str, ssize_t len, + ssize_t *nr_schemes) +{ + struct damos *scheme, **schemes; + const int max_nr_schemes = 256; + int pos = 0, parsed, ret; + unsigned int action_input; + enum damos_action action; + + schemes = kmalloc_array(max_nr_schemes, sizeof(scheme), + GFP_KERNEL); + if (!schemes) + return NULL; + + *nr_schemes = 0; + while (pos < len && *nr_schemes < max_nr_schemes) { + struct damos_access_pattern pattern = {}; + struct damos_quota quota = {}; + struct damos_watermarks wmarks; + + ret = sscanf(&str[pos], + "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", + &pattern.min_sz_region, &pattern.max_sz_region, + &pattern.min_nr_accesses, + &pattern.max_nr_accesses, + &pattern.min_age_region, + &pattern.max_age_region, + &action_input, "a.ms, + "a.sz, "a.reset_interval, + "a.weight_sz, "a.weight_nr_accesses, + "a.weight_age, &wmarks.metric, + &wmarks.interval, &wmarks.high, &wmarks.mid, + &wmarks.low, &parsed); + if (ret != 18) + break; + action = dbgfs_scheme_action_to_damos_action(action_input); + if ((int)action < 0) + goto fail; + + if (pattern.min_sz_region > pattern.max_sz_region || + pattern.min_nr_accesses > pattern.max_nr_accesses || + pattern.min_age_region > 
pattern.max_age_region) + goto fail; + + if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || + wmarks.mid < wmarks.low) + goto fail; + + pos += parsed; + scheme = damon_new_scheme(&pattern, action, "a, &wmarks); + if (!scheme) + goto fail; + + schemes[*nr_schemes] = scheme; + *nr_schemes += 1; + } + return schemes; +fail: + free_schemes_arr(schemes, *nr_schemes); + return NULL; +} + +static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + struct damos **schemes; + ssize_t nr_schemes = 0, ret; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + schemes = str_to_schemes(kbuf, count, &nr_schemes); + if (!schemes) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + damon_set_schemes(ctx, schemes, nr_schemes); + ret = count; + nr_schemes = 0; + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); + free_schemes_arr(schemes, nr_schemes); +out: + kfree(kbuf); + return ret; +} + +static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) +{ + struct damon_target *t; + int id; + int written = 0; + int rc; + + damon_for_each_target(t, ctx) { + if (damon_target_has_pid(ctx)) + /* Show pid numbers to debugfs users */ + id = pid_vnr(t->pid); + else + /* Show 42 for physical address space, just for fun */ + id = 42; + + rc = scnprintf(&buf[written], len - written, "%d ", id); + if (!rc) + return -ENOMEM; + written += rc; + } + if (written) + written -= 1; + written += scnprintf(&buf[written], len - written, "\n"); + return written; +} + +static ssize_t dbgfs_target_ids_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + ssize_t len; + char ids_buf[320]; + + mutex_lock(&ctx->kdamond_lock); + len = sprint_target_ids(ctx, ids_buf, 320); + 
mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + return len; + + return simple_read_from_buffer(buf, count, ppos, ids_buf, len); +} + +/* + * Converts a string into an integers array + * + * Returns an array of integers array if the conversion success, or NULL + * otherwise. + */ +static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints) +{ + int *array; + const int max_nr_ints = 32; + int nr; + int pos = 0, parsed, ret; + + *nr_ints = 0; + array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL); + if (!array) + return NULL; + while (*nr_ints < max_nr_ints && pos < len) { + ret = sscanf(&str[pos], "%d%n", &nr, &parsed); + pos += parsed; + if (ret != 1) + break; + array[*nr_ints] = nr; + *nr_ints += 1; + } + + return array; +} + +static void dbgfs_put_pids(struct pid **pids, int nr_pids) +{ + int i; + + for (i = 0; i < nr_pids; i++) + put_pid(pids[i]); +} + +/* + * Converts a string into an struct pid pointers array + * + * Returns an array of struct pid pointers if the conversion success, or NULL + * otherwise. + */ +static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids) +{ + int *ints; + ssize_t nr_ints; + struct pid **pids; + + *nr_pids = 0; + + ints = str_to_ints(str, len, &nr_ints); + if (!ints) + return NULL; + + pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL); + if (!pids) + goto out; + + for (; *nr_pids < nr_ints; (*nr_pids)++) { + pids[*nr_pids] = find_get_pid(ints[*nr_pids]); + if (!pids[*nr_pids]) { + dbgfs_put_pids(pids, *nr_pids); + kfree(ints); + kfree(pids); + return NULL; + } + } + +out: + kfree(ints); + return pids; +} + +/* + * dbgfs_set_targets() - Set monitoring targets. + * @ctx: monitoring context + * @nr_targets: number of targets + * @pids: array of target pids (size is same to @nr_targets) + * + * This function should not be called while the kdamond is running. @pids is + * ignored if the context is not configured to have pid in each target. 
On + * failure, reference counts of all pids in @pids are decremented. + * + * Return: 0 on success, negative error code otherwise. + */ +static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, + struct pid **pids) +{ + ssize_t i; + struct damon_target *t, *next; + + damon_for_each_target_safe(t, next, ctx) { + if (damon_target_has_pid(ctx)) + put_pid(t->pid); + damon_destroy_target(t); + } + + for (i = 0; i < nr_targets; i++) { + t = damon_new_target(); + if (!t) { + damon_for_each_target_safe(t, next, ctx) + damon_destroy_target(t); + if (damon_target_has_pid(ctx)) + dbgfs_put_pids(pids, nr_targets); + return -ENOMEM; + } + if (damon_target_has_pid(ctx)) + t->pid = pids[i]; + damon_add_target(ctx, t); + } + + return 0; +} + +static ssize_t dbgfs_target_ids_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + bool id_is_pid = true; + char *kbuf; + struct pid **target_pids = NULL; + ssize_t nr_targets; + ssize_t ret; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + if (!strncmp(kbuf, "paddr\n", count)) { + id_is_pid = false; + nr_targets = 1; + } + + if (id_is_pid) { + target_pids = str_to_pids(kbuf, count, &nr_targets); + if (!target_pids) { + ret = -ENOMEM; + goto out; + } + } + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + if (id_is_pid) + dbgfs_put_pids(target_pids, nr_targets); + ret = -EBUSY; + goto unlock_out; + } + + /* remove previously set targets */ + dbgfs_set_targets(ctx, 0, NULL); + if (!nr_targets) { + ret = count; + goto unlock_out; + } + + /* Configure the context for the address space type */ + if (id_is_pid) + ret = damon_select_ops(ctx, DAMON_OPS_VADDR); + else + ret = damon_select_ops(ctx, DAMON_OPS_PADDR); + if (ret) + goto unlock_out; + + ret = dbgfs_set_targets(ctx, nr_targets, target_pids); + if (!ret) + ret = count; + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); + kfree(target_pids); +out: 
+ kfree(kbuf); + return ret; +} + +static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len) +{ + struct damon_target *t; + struct damon_region *r; + int target_idx = 0; + int written = 0; + int rc; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + rc = scnprintf(&buf[written], len - written, + "%d %lu %lu\n", + target_idx, r->ar.start, r->ar.end); + if (!rc) + return -ENOMEM; + written += rc; + } + target_idx++; + } + return written; +} + +static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + mutex_unlock(&ctx->kdamond_lock); + len = -EBUSY; + goto out; + } + + len = sprint_init_regions(ctx, kbuf, count); + mutex_unlock(&ctx->kdamond_lock); + if (len < 0) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + +static int add_init_region(struct damon_ctx *c, int target_idx, + struct damon_addr_range *ar) +{ + struct damon_target *t; + struct damon_region *r, *prev; + unsigned long idx = 0; + int rc = -EINVAL; + + if (ar->start >= ar->end) + return -EINVAL; + + damon_for_each_target(t, c) { + if (idx++ == target_idx) { + r = damon_new_region(ar->start, ar->end); + if (!r) + return -ENOMEM; + damon_add_region(r, t); + if (damon_nr_regions(t) > 1) { + prev = damon_prev_region(r); + if (prev->ar.end > r->ar.start) { + damon_destroy_region(r, t); + return -EINVAL; + } + } + rc = 0; + } + } + return rc; +} + +static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len) +{ + struct damon_target *t; + struct damon_region *r, *next; + int pos = 0, parsed, ret; + int target_idx; + struct damon_addr_range ar; + int err; + + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, 
next, t) + damon_destroy_region(r, t); + } + + while (pos < len) { + ret = sscanf(&str[pos], "%d %lu %lu%n", + &target_idx, &ar.start, &ar.end, &parsed); + if (ret != 3) + break; + err = add_init_region(c, target_idx, &ar); + if (err) + goto fail; + pos += parsed; + } + + return 0; + +fail: + damon_for_each_target(t, c) { + damon_for_each_region_safe(r, next, t) + damon_destroy_region(r, t); + } + return err; +} + +static ssize_t dbgfs_init_regions_write(struct file *file, + const char __user *buf, size_t count, + loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t ret = count; + int err; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) { + ret = -EBUSY; + goto unlock_out; + } + + err = set_init_regions(ctx, kbuf, ret); + if (err) + ret = err; + +unlock_out: + mutex_unlock(&ctx->kdamond_lock); + kfree(kbuf); + return ret; +} + +static ssize_t dbgfs_kdamond_pid_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + struct damon_ctx *ctx = file->private_data; + char *kbuf; + ssize_t len; + + kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN); + if (!kbuf) + return -ENOMEM; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) + len = scnprintf(kbuf, count, "%d\n", ctx->kdamond->pid); + else + len = scnprintf(kbuf, count, "none\n"); + mutex_unlock(&ctx->kdamond_lock); + if (!len) + goto out; + len = simple_read_from_buffer(buf, count, ppos, kbuf, len); + +out: + kfree(kbuf); + return len; +} + +static int damon_dbgfs_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + + return nonseekable_open(inode, file); +} + +static const struct file_operations attrs_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_attrs_read, + .write = dbgfs_attrs_write, +}; + +static const struct file_operations schemes_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_schemes_read, + .write = 
dbgfs_schemes_write, +}; + +static const struct file_operations target_ids_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_target_ids_read, + .write = dbgfs_target_ids_write, +}; + +static const struct file_operations init_regions_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_init_regions_read, + .write = dbgfs_init_regions_write, +}; + +static const struct file_operations kdamond_pid_fops = { + .open = damon_dbgfs_open, + .read = dbgfs_kdamond_pid_read, +}; + +static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx) +{ + const char * const file_names[] = {"attrs", "schemes", "target_ids", + "init_regions", "kdamond_pid"}; + const struct file_operations *fops[] = {&attrs_fops, &schemes_fops, + &target_ids_fops, &init_regions_fops, &kdamond_pid_fops}; + int i; + + for (i = 0; i < ARRAY_SIZE(file_names); i++) + debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]); +} + +static void dbgfs_before_terminate(struct damon_ctx *ctx) +{ + struct damon_target *t, *next; + + if (!damon_target_has_pid(ctx)) + return; + + mutex_lock(&ctx->kdamond_lock); + damon_for_each_target_safe(t, next, ctx) { + put_pid(t->pid); + damon_destroy_target(t); + } + mutex_unlock(&ctx->kdamond_lock); +} + +static struct damon_ctx *dbgfs_new_ctx(void) +{ + struct damon_ctx *ctx; + + ctx = damon_new_ctx(); + if (!ctx) + return NULL; + + if (damon_select_ops(ctx, DAMON_OPS_VADDR) && + damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return NULL; + } + ctx->callback.before_terminate = dbgfs_before_terminate; + return ctx; +} + +static void dbgfs_destroy_ctx(struct damon_ctx *ctx) +{ + damon_destroy_ctx(ctx); +} + +/* + * Make a context of @name and create a debugfs directory for it. + * + * This function should be called while holding damon_dbgfs_lock. + * + * Returns 0 on success, negative error code otherwise. 
+ */ +static int dbgfs_mk_context(char *name) +{ + struct dentry *root, **new_dirs, *new_dir; + struct damon_ctx **new_ctxs, *new_ctx; + + if (damon_nr_running_ctxs()) + return -EBUSY; + + new_ctxs = krealloc(dbgfs_ctxs, sizeof(*dbgfs_ctxs) * + (dbgfs_nr_ctxs + 1), GFP_KERNEL); + if (!new_ctxs) + return -ENOMEM; + dbgfs_ctxs = new_ctxs; + + new_dirs = krealloc(dbgfs_dirs, sizeof(*dbgfs_dirs) * + (dbgfs_nr_ctxs + 1), GFP_KERNEL); + if (!new_dirs) + return -ENOMEM; + dbgfs_dirs = new_dirs; + + root = dbgfs_dirs[0]; + if (!root) + return -ENOENT; + + new_dir = debugfs_create_dir(name, root); + /* Below check is required for a potential duplicated name case */ + if (IS_ERR(new_dir)) + return PTR_ERR(new_dir); + dbgfs_dirs[dbgfs_nr_ctxs] = new_dir; + + new_ctx = dbgfs_new_ctx(); + if (!new_ctx) { + debugfs_remove(new_dir); + dbgfs_dirs[dbgfs_nr_ctxs] = NULL; + return -ENOMEM; + } + + dbgfs_ctxs[dbgfs_nr_ctxs] = new_ctx; + dbgfs_fill_ctx_dir(dbgfs_dirs[dbgfs_nr_ctxs], + dbgfs_ctxs[dbgfs_nr_ctxs]); + dbgfs_nr_ctxs++; + + return 0; +} + +static ssize_t dbgfs_mk_context_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + char *ctx_name; + ssize_t ret; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + ctx_name = kmalloc(count + 1, GFP_KERNEL); + if (!ctx_name) { + kfree(kbuf); + return -ENOMEM; + } + + /* Trim white space */ + if (sscanf(kbuf, "%s", ctx_name) != 1) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&damon_dbgfs_lock); + ret = dbgfs_mk_context(ctx_name); + if (!ret) + ret = count; + mutex_unlock(&damon_dbgfs_lock); + +out: + kfree(kbuf); + kfree(ctx_name); + return ret; +} + +/* + * Remove a context of @name and its debugfs directory. + * + * This function should be called while holding damon_dbgfs_lock. + * + * Return 0 on success, negative error code otherwise. 
+ */ +static int dbgfs_rm_context(char *name) +{ + struct dentry *root, *dir, **new_dirs; + struct inode *inode; + struct damon_ctx **new_ctxs; + int i, j; + int ret = 0; + + if (damon_nr_running_ctxs()) + return -EBUSY; + + root = dbgfs_dirs[0]; + if (!root) + return -ENOENT; + + dir = debugfs_lookup(name, root); + if (!dir) + return -ENOENT; + + inode = d_inode(dir); + if (!S_ISDIR(inode->i_mode)) { + ret = -EINVAL; + goto out_dput; + } + + new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), + GFP_KERNEL); + if (!new_dirs) { + ret = -ENOMEM; + goto out_dput; + } + + new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs), + GFP_KERNEL); + if (!new_ctxs) { + ret = -ENOMEM; + goto out_new_dirs; + } + + for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) { + if (dbgfs_dirs[i] == dir) { + debugfs_remove(dbgfs_dirs[i]); + dbgfs_destroy_ctx(dbgfs_ctxs[i]); + continue; + } + new_dirs[j] = dbgfs_dirs[i]; + new_ctxs[j++] = dbgfs_ctxs[i]; + } + + kfree(dbgfs_dirs); + kfree(dbgfs_ctxs); + + dbgfs_dirs = new_dirs; + dbgfs_ctxs = new_ctxs; + dbgfs_nr_ctxs--; + + goto out_dput; + +out_new_dirs: + kfree(new_dirs); +out_dput: + dput(dir); + return ret; +} + +static ssize_t dbgfs_rm_context_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t ret; + char *ctx_name; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + ctx_name = kmalloc(count + 1, GFP_KERNEL); + if (!ctx_name) { + kfree(kbuf); + return -ENOMEM; + } + + /* Trim white space */ + if (sscanf(kbuf, "%s", ctx_name) != 1) { + ret = -EINVAL; + goto out; + } + + mutex_lock(&damon_dbgfs_lock); + ret = dbgfs_rm_context(ctx_name); + if (!ret) + ret = count; + mutex_unlock(&damon_dbgfs_lock); + +out: + kfree(kbuf); + kfree(ctx_name); + return ret; +} + +static ssize_t dbgfs_monitor_on_read(struct file *file, + char __user *buf, size_t count, loff_t *ppos) +{ + char monitor_on_buf[5]; + bool monitor_on = 
damon_nr_running_ctxs() != 0; + int len; + + len = scnprintf(monitor_on_buf, 5, monitor_on ? "on\n" : "off\n"); + + return simple_read_from_buffer(buf, count, ppos, monitor_on_buf, len); +} + +static ssize_t dbgfs_monitor_on_write(struct file *file, + const char __user *buf, size_t count, loff_t *ppos) +{ + ssize_t ret; + char *kbuf; + + kbuf = user_input_str(buf, count, ppos); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + /* Remove white space */ + if (sscanf(kbuf, "%s", kbuf) != 1) { + kfree(kbuf); + return -EINVAL; + } + + mutex_lock(&damon_dbgfs_lock); + if (!strncmp(kbuf, "on", count)) { + int i; + + for (i = 0; i < dbgfs_nr_ctxs; i++) { + if (damon_targets_empty(dbgfs_ctxs[i])) { + kfree(kbuf); + mutex_unlock(&damon_dbgfs_lock); + return -EINVAL; + } + } + ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs, true); + } else if (!strncmp(kbuf, "off", count)) { + ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs); + } else { + ret = -EINVAL; + } + mutex_unlock(&damon_dbgfs_lock); + + if (!ret) + ret = count; + kfree(kbuf); + return ret; +} + +static const struct file_operations mk_contexts_fops = { + .write = dbgfs_mk_context_write, +}; + +static const struct file_operations rm_contexts_fops = { + .write = dbgfs_rm_context_write, +}; + +static const struct file_operations monitor_on_fops = { + .read = dbgfs_monitor_on_read, + .write = dbgfs_monitor_on_write, +}; + +static int __init __damon_dbgfs_init(void) +{ + struct dentry *dbgfs_root; + const char * const file_names[] = {"mk_contexts", "rm_contexts", + "monitor_on"}; + const struct file_operations *fops[] = {&mk_contexts_fops, + &rm_contexts_fops, &monitor_on_fops}; + int i; + + dbgfs_root = debugfs_create_dir("damon", NULL); + + for (i = 0; i < ARRAY_SIZE(file_names); i++) + debugfs_create_file(file_names[i], 0600, dbgfs_root, NULL, + fops[i]); + dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); + + dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL); + if (!dbgfs_dirs) { + debugfs_remove(dbgfs_root); + return -ENOMEM; + 
} + dbgfs_dirs[0] = dbgfs_root; + + return 0; +} + +/* + * Functions for the initialization + */ + +static int __init damon_dbgfs_init(void) +{ + int rc = -ENOMEM; + + mutex_lock(&damon_dbgfs_lock); + dbgfs_ctxs = kmalloc(sizeof(*dbgfs_ctxs), GFP_KERNEL); + if (!dbgfs_ctxs) + goto out; + dbgfs_ctxs[0] = dbgfs_new_ctx(); + if (!dbgfs_ctxs[0]) { + kfree(dbgfs_ctxs); + goto out; + } + dbgfs_nr_ctxs = 1; + + rc = __damon_dbgfs_init(); + if (rc) { + kfree(dbgfs_ctxs[0]); + kfree(dbgfs_ctxs); + pr_err("%s: dbgfs init failed\n", __func__); + } + +out: + mutex_unlock(&damon_dbgfs_lock); + return rc; +} + +module_init(damon_dbgfs_init); + +#include "dbgfs-test.h" diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c new file mode 100644 index 000000000..63bdad20d --- /dev/null +++ b/mm/damon/lru_sort.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON-based LRU-lists Sorting + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-lru-sort: " fmt + +#include +#include +#include +#include +#include + +#include "modules-common.h" + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_lru_sort." + +/* + * Enable or disable DAMON_LRU_SORT. + * + * You can enable DAMON_LRU_SORT by setting the value of this parameter as + * ``Y``. Setting it as ``N`` disables DAMON_LRU_SORT. Note that + * DAMON_LRU_SORT could do no real monitoring and LRU-lists sorting due to the + * watermarks-based activation condition. Refer to below descriptions for the + * watermarks parameter for this. + */ +static bool enabled __read_mostly; + +/* + * Make DAMON_LRU_SORT reads the input parameters again, except ``enabled``. + * + * Input parameters that updated while DAMON_LRU_SORT is running are not + * applied by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT + * reads values of parametrs except ``enabled`` again. Once the re-reading is + * done, this parameter is set as ``N``. 
If invalid parameters are found while + * the re-reading, DAMON_LRU_SORT will be disabled. + */ +static bool commit_inputs __read_mostly; +module_param(commit_inputs, bool, 0600); + +/* + * Access frequency threshold for hot memory regions identification in permil. + * + * If a memory region is accessed in frequency of this or higher, + * DAMON_LRU_SORT identifies the region as hot, and mark it as accessed on the + * LRU list, so that it could not be reclaimed under memory pressure. 50% by + * default. + */ +static unsigned long hot_thres_access_freq = 500; +module_param(hot_thres_access_freq, ulong, 0600); + +/* + * Time threshold for cold memory regions identification in microseconds. + * + * If a memory region is not accessed for this or longer time, DAMON_LRU_SORT + * identifies the region as cold, and mark it as unaccessed on the LRU list, so + * that it could be reclaimed first under memory pressure. 120 seconds by + * default. + */ +static unsigned long cold_min_age __read_mostly = 120000000; +module_param(cold_min_age, ulong, 0600); + +static struct damos_quota damon_lru_sort_quota = { + /* Use up to 10 ms per 1 sec, by default */ + .ms = 10, + .sz = 0, + .reset_interval = 1000, + /* Within the quota, mark hotter regions accessed first. 
*/ + .weight_sz = 0, + .weight_nr_accesses = 1, + .weight_age = 0, +}; +DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota); + +static struct damos_watermarks damon_lru_sort_wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = 5000000, /* 5 seconds */ + .high = 200, /* 20 percent */ + .mid = 150, /* 15 percent */ + .low = 50, /* 5 percent */ +}; +DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_lru_sort_wmarks); + +static struct damon_attrs damon_lru_sort_mon_attrs = { + .sample_interval = 5000, /* 5 ms */ + .aggr_interval = 100000, /* 100 ms */ + .ops_update_interval = 0, + .min_nr_regions = 10, + .max_nr_regions = 1000, +}; +DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs); + +/* + * Start of the target memory region in physical address. + * + * The start physical address of memory region that DAMON_LRU_SORT will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_start __read_mostly; +module_param(monitor_region_start, ulong, 0600); + +/* + * End of the target memory region in physical address. + * + * The end physical address of memory region that DAMON_LRU_SORT will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_end __read_mostly; +module_param(monitor_region_end, ulong, 0600); + +/* + * PID of the DAMON thread + * + * If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. + * Else, -1. 
+ */ +static int kdamond_pid __read_mostly = -1; +module_param(kdamond_pid, int, 0400); + +static struct damos_stat damon_lru_sort_hot_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_hot_stat, + lru_sort_tried_hot_regions, lru_sorted_hot_regions, + hot_quota_exceeds); + +static struct damos_stat damon_lru_sort_cold_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, + lru_sort_tried_cold_regions, lru_sorted_cold_regions, + cold_quota_exceeds); + +static struct damos_access_pattern damon_lru_sort_stub_pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* no matter its access frequency */ + .min_nr_accesses = 0, + .max_nr_accesses = UINT_MAX, + /* no matter its age */ + .min_age_region = 0, + .max_age_region = UINT_MAX, +}; + +static struct damon_ctx *ctx; +static struct damon_target *target; + +static struct damos *damon_lru_sort_new_scheme( + struct damos_access_pattern *pattern, enum damos_action action) +{ + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for hot/cold pages sorting */ + quota.ms = quota.ms / 2; + + return damon_new_scheme( + /* find the pattern, and */ + pattern, + /* (de)prioritize on LRU-lists */ + action, + /* under the quota. */ + "a, + /* (De)activate this according to the watermarks. 
*/ + &damon_lru_sort_wmarks); +} + +/* Create a DAMON-based operation scheme for hot memory regions */ +static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) +{ + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; + + pattern.min_nr_accesses = hot_thres; + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO); +} + +/* Create a DAMON-based operation scheme for cold memory regions */ +static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) +{ + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; + + pattern.max_nr_accesses = 0; + pattern.min_age_region = cold_thres; + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); +} + +static int damon_lru_sort_apply_parameters(void) +{ + struct damos *scheme; + unsigned int hot_thres, cold_thres; + int err = 0; + + err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); + if (err) + return err; + + hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) * + hot_thres_access_freq / 1000; + scheme = damon_lru_sort_new_hot_scheme(hot_thres); + if (!scheme) + return -ENOMEM; + damon_set_schemes(ctx, &scheme, 1); + + cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; + scheme = damon_lru_sort_new_cold_scheme(cold_thres); + if (!scheme) + return -ENOMEM; + damon_add_scheme(ctx, scheme); + + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); +} + +static int damon_lru_sort_turn(bool on) +{ + int err; + + if (!on) { + err = damon_stop(&ctx, 1); + if (!err) + kdamond_pid = -1; + return err; + } + + err = damon_lru_sort_apply_parameters(); + if (err) + return err; + + err = damon_start(&ctx, 1, true); + if (err) + return err; + kdamond_pid = ctx->kdamond->pid; + return 0; +} + +static struct delayed_work damon_lru_sort_timer; +static void damon_lru_sort_timer_fn(struct work_struct *work) +{ + static bool last_enabled; + bool now_enabled; + + now_enabled = enabled; + if 
(last_enabled != now_enabled) { + if (!damon_lru_sort_turn(now_enabled)) + last_enabled = now_enabled; + else + enabled = last_enabled; + } +} +static DECLARE_DELAYED_WORK(damon_lru_sort_timer, damon_lru_sort_timer_fn); + +static bool damon_lru_sort_initialized; + +static int damon_lru_sort_enabled_store(const char *val, + const struct kernel_param *kp) +{ + int rc = param_set_bool(val, kp); + + if (rc < 0) + return rc; + + if (!damon_lru_sort_initialized) + return rc; + + schedule_delayed_work(&damon_lru_sort_timer, 0); + + return 0; +} + +static const struct kernel_param_ops enabled_param_ops = { + .set = damon_lru_sort_enabled_store, + .get = param_get_bool, +}; + +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, + "Enable or disable DAMON_LRU_SORT (default: disabled)"); + +static int damon_lru_sort_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_lru_sort_apply_parameters(); + commit_inputs = false; + return err; +} + +static int damon_lru_sort_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + if (s->action == DAMOS_LRU_PRIO) + damon_lru_sort_hot_stat = s->stat; + else if (s->action == DAMOS_LRU_DEPRIO) + damon_lru_sort_cold_stat = s->stat; + } + + return damon_lru_sort_handle_commit_inputs(); +} + +static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c) +{ + return damon_lru_sort_handle_commit_inputs(); +} + +static int __init damon_lru_sort_init(void) +{ + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + + ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; + ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + 
schedule_delayed_work(&damon_lru_sort_timer, 0); + + damon_lru_sort_initialized = true; + return 0; +} + +module_init(damon_lru_sort_init); diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h new file mode 100644 index 000000000..5a4921851 --- /dev/null +++ b/mm/damon/modules-common.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park + */ + +#include + +#define DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(attrs) \ + module_param_named(sample_interval, attrs.sample_interval, \ + ulong, 0600); \ + module_param_named(aggr_interval, attrs.aggr_interval, ulong, \ + 0600); \ + module_param_named(min_nr_regions, attrs.min_nr_regions, ulong, \ + 0600); \ + module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ + 0600); + +#define DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ + module_param_named(quota_ms, quota.ms, ulong, 0600); \ + module_param_named(quota_reset_interval_ms, \ + quota.reset_interval, ulong, 0600); + +#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ + DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ + module_param_named(quota_sz, quota.sz, ulong, 0600); + +#define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ + module_param_named(wmarks_interval, wmarks.interval, ulong, \ + 0600); \ + module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ + module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ + module_param_named(wmarks_low, wmarks.low, ulong, 0600); + +#define DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(stat, try_name, \ + succ_name, qt_exceed_name) \ + module_param_named(nr_##try_name, stat.nr_tried, ulong, 0400); \ + module_param_named(bytes_##try_name, stat.sz_tried, ulong, \ + 0400); \ + module_param_named(nr_##succ_name, stat.nr_applied, ulong, \ + 0400); \ + module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \ + 0400); \ + module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \ + 0400); diff --git a/mm/damon/ops-common.c 
b/mm/damon/ops-common.c new file mode 100644 index 000000000..0b75a8d5c --- /dev/null +++ b/mm/damon/ops-common.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for Data Access Monitoring + * + * Author: SeongJae Park + */ + +#include +#include +#include +#include + +#include "ops-common.h" + +/* + * Get an online page for a pfn if it's in the LRU list. Otherwise, returns + * NULL. + * + * The body of this function is stolen from the 'page_idle_get_page()'. We + * steal rather than reuse it because the code is quite simple. + */ +struct page *damon_get_page(unsigned long pfn) +{ + struct page *page = pfn_to_online_page(pfn); + + if (!page || !PageLRU(page) || !get_page_unless_zero(page)) + return NULL; + + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + return page; +} + +void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr) +{ + bool referenced = false; + struct page *page = damon_get_page(pte_pfn(*pte)); + + if (!page) + return; + + if (ptep_test_and_clear_young(vma, addr, pte)) + referenced = true; + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE)) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + bool referenced = false; + struct page *page = damon_get_page(pmd_pfn(*pmd)); + + if (!page) + return; + + if (pmdp_test_and_clear_young(vma, addr, pmd)) + referenced = true; + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(vma->vm_mm, addr, addr + HPAGE_PMD_SIZE)) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} + +#define DAMON_MAX_SUBSCORE (100) +#define 
DAMON_MAX_AGE_IN_LOG (32) + +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + int freq_subscore; + unsigned int age_in_sec; + int age_in_log, age_subscore; + unsigned int freq_weight = s->quota.weight_nr_accesses; + unsigned int age_weight = s->quota.weight_age; + int hotness; + + freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / + damon_max_nr_accesses(&c->attrs); + + age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000; + for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; + age_in_log++, age_in_sec >>= 1) + ; + + /* If frequency is 0, higher age means it's colder */ + if (freq_subscore == 0) + age_in_log *= -1; + + /* + * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. + * Scale it to be in [0, 100] and set it as age subscore. + */ + age_in_log += DAMON_MAX_AGE_IN_LOG; + age_subscore = age_in_log * DAMON_MAX_SUBSCORE / + DAMON_MAX_AGE_IN_LOG / 2; + + hotness = (freq_weight * freq_subscore + age_weight * age_subscore); + if (freq_weight + age_weight) + hotness /= freq_weight + age_weight; + /* + * Transform it to fit in [0, DAMOS_MAX_SCORE] + */ + hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; + + return hotness; +} + +int damon_cold_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + int hotness = damon_hot_score(c, r, s); + + /* Return coldness of the region */ + return DAMOS_MAX_SCORE - hotness; +} diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h new file mode 100644 index 000000000..e062a8874 --- /dev/null +++ b/mm/damon/ops-common.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for Data Access Monitoring + * + * Author: SeongJae Park + */ + +#include + +struct page *damon_get_page(unsigned long pfn); + +void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr); +void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr); + +int 
damon_cold_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s); +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c new file mode 100644 index 000000000..7bc8d79c7 --- /dev/null +++ b/mm/damon/paddr.c @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Primitives for The Physical Address Space + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-pa: " fmt + +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "ops-common.h" + +static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); + + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) + damon_ptep_mkold(pvmw.pte, vma, addr); + else + damon_pmdp_mkold(pvmw.pmd, vma, addr); + } + return true; +} + +static void damon_pa_mkold(unsigned long paddr) +{ + struct folio *folio; + struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct rmap_walk_control rwc = { + .rmap_one = __damon_pa_mkold, + .anon_lock = folio_lock_anon_vma_read, + }; + bool need_lock; + + if (!page) + return; + folio = page_folio(page); + + if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { + folio_set_idle(folio); + goto out; + } + + need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); + if (need_lock && !folio_trylock(folio)) + goto out; + + rmap_walk(folio, &rwc); + + if (need_lock) + folio_unlock(folio); + +out: + folio_put(folio); +} + +static void __damon_pa_prepare_access_check(struct damon_region *r) +{ + r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + + damon_pa_mkold(r->sampling_addr); +} + +static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) + __damon_pa_prepare_access_check(r); + } +} 
+ +struct damon_pa_access_chk_result { + unsigned long page_sz; + bool accessed; +}; + +static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct damon_pa_access_chk_result *result = arg; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); + + result->accessed = false; + result->page_sz = PAGE_SIZE; + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) { + result->accessed = pte_young(*pvmw.pte) || + !folio_test_idle(folio) || + mmu_notifier_test_young(vma->vm_mm, addr); + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + result->accessed = pmd_young(*pvmw.pmd) || + !folio_test_idle(folio) || + mmu_notifier_test_young(vma->vm_mm, addr); + result->page_sz = HPAGE_PMD_SIZE; +#else + WARN_ON_ONCE(1); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + } + if (result->accessed) { + page_vma_mapped_walk_done(&pvmw); + break; + } + } + + /* If accessed, stop walking */ + return !result->accessed; +} + +static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz) +{ + struct folio *folio; + struct page *page = damon_get_page(PHYS_PFN(paddr)); + struct damon_pa_access_chk_result result = { + .page_sz = PAGE_SIZE, + .accessed = false, + }; + struct rmap_walk_control rwc = { + .arg = &result, + .rmap_one = __damon_pa_young, + .anon_lock = folio_lock_anon_vma_read, + }; + bool need_lock; + + if (!page) + return false; + folio = page_folio(page); + + if (!folio_mapped(folio) || !folio_raw_mapping(folio)) { + if (folio_test_idle(folio)) + result.accessed = false; + else + result.accessed = true; + folio_put(folio); + goto out; + } + + need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); + if (need_lock && !folio_trylock(folio)) { + folio_put(folio); + return false; + } + + rmap_walk(folio, &rwc); + + if (need_lock) + folio_unlock(folio); + folio_put(folio); + +out: + *page_sz = result.page_sz; + return result.accessed; +} + +static void __damon_pa_check_access(struct damon_region 
*r) +{ + static unsigned long last_addr; + static unsigned long last_page_sz = PAGE_SIZE; + static bool last_accessed; + + /* If the region is in the last checked page, reuse the result */ + if (ALIGN_DOWN(last_addr, last_page_sz) == + ALIGN_DOWN(r->sampling_addr, last_page_sz)) { + if (last_accessed) + r->nr_accesses++; + return; + } + + last_accessed = damon_pa_young(r->sampling_addr, &last_page_sz); + if (last_accessed) + r->nr_accesses++; + + last_addr = r->sampling_addr; +} + +static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct damon_region *r; + unsigned int max_nr_accesses = 0; + + damon_for_each_target(t, ctx) { + damon_for_each_region(r, t) { + __damon_pa_check_access(r); + max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + } + } + + return max_nr_accesses; +} + +static unsigned long damon_pa_pageout(struct damon_region *r) +{ + unsigned long addr, applied; + LIST_HEAD(page_list); + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + + ClearPageReferenced(page); + test_and_clear_page_young(page); + if (isolate_lru_page(page)) { + put_page(page); + continue; + } + if (PageUnevictable(page)) + putback_lru_page(page); + else + list_add(&page->lru, &page_list); + put_page(page); + } + applied = reclaim_pages(&page_list); + cond_resched(); + return applied * PAGE_SIZE; +} + +static inline unsigned long damon_pa_mark_accessed_or_deactivate( + struct damon_region *r, bool mark_accessed) +{ + unsigned long addr, applied = 0; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + if (mark_accessed) + mark_page_accessed(page); + else + deactivate_page(page); + put_page(page); + applied++; + } + return applied * PAGE_SIZE; +} + +static unsigned long damon_pa_mark_accessed(struct damon_region *r) +{ + return 
damon_pa_mark_accessed_or_deactivate(r, true); +} + +static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +{ + return damon_pa_mark_accessed_or_deactivate(r, false); +} + +static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_pa_pageout(r); + case DAMOS_LRU_PRIO: + return damon_pa_mark_accessed(r); + case DAMOS_LRU_DEPRIO: + return damon_pa_deactivate_pages(r); + case DAMOS_STAT: + break; + default: + /* DAMOS actions that not yet supported by 'paddr'. */ + break; + } + return 0; +} + +static int damon_pa_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_cold_score(context, r, scheme); + case DAMOS_LRU_PRIO: + return damon_hot_score(context, r, scheme); + case DAMOS_LRU_DEPRIO: + return damon_cold_score(context, r, scheme); + default: + break; + } + + return DAMOS_MAX_SCORE; +} + +static int __init damon_pa_initcall(void) +{ + struct damon_operations ops = { + .id = DAMON_OPS_PADDR, + .init = NULL, + .update = NULL, + .prepare_access_checks = damon_pa_prepare_access_checks, + .check_accesses = damon_pa_check_accesses, + .reset_aggregated = NULL, + .target_valid = NULL, + .cleanup = NULL, + .apply_scheme = damon_pa_apply_scheme, + .get_scheme_score = damon_pa_scheme_score, + }; + + return damon_register_ops(&ops); +}; + +subsys_initcall(damon_pa_initcall); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c new file mode 100644 index 000000000..162c9b1ca --- /dev/null +++ b/mm/damon/reclaim.c @@ -0,0 +1,284 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON-based page reclamation + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-reclaim: " fmt + +#include +#include +#include +#include +#include + +#include "modules-common.h" + +#ifdef 
/*
 * Enable or disable DAMON_RECLAIM.
 *
 * You can enable DAMON_RECLAIM by setting the value of this parameter as
 * ``Y``.  Setting it as ``N`` disables DAMON_RECLAIM.  Note that
 * DAMON_RECLAIM could do no real monitoring and reclamation due to the
 * watermarks-based activation condition.  Refer to the descriptions of the
 * watermarks parameters below for this.
 */
static bool enabled __read_mostly;

/*
 * Make DAMON_RECLAIM read the input parameters again, except ``enabled``.
 *
 * Input parameters that are updated while DAMON_RECLAIM is running are not
 * applied by default.  Once this parameter is set as ``Y``, DAMON_RECLAIM
 * reads the values of the parameters except ``enabled`` again.  Once the
 * re-reading is done, this parameter is set as ``N``.  If invalid parameters
 * are found during the re-reading, DAMON_RECLAIM will be disabled.
 */
static bool commit_inputs __read_mostly;
module_param(commit_inputs, bool, 0600);

/*
 * Time threshold for cold memory regions identification in microseconds.
 *
 * If a memory region is not accessed for this or longer time, DAMON_RECLAIM
 * identifies the region as cold, and reclaims it.  120 seconds by default.
 */
static unsigned long min_age __read_mostly = 120000000;
module_param(min_age, ulong, 0600);
/*
 * Default watermarks of DAMON_RECLAIM, based on the free memory rate metric.
 *
 * As the per-field notes show, the interval is given in microseconds and the
 * three thresholds in per-thousand.  DEFINE_DAMON_MODULES_WMARKS_PARAMS()
 * makes the interval and the thresholds (but not the metric) adjustable as
 * module parameters.
 */
static struct damos_watermarks damon_reclaim_wmarks = {
	.metric = DAMOS_WMARK_FREE_MEM_RATE,
	.interval = 5000000,	/* 5 seconds */
	.high = 500,		/* 50 percent */
	.mid = 400,		/* 40 percent */
	.low = 200,		/* 20 percent */
};
DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_reclaim_wmarks);
+ */ +static int kdamond_pid __read_mostly = -1; +module_param(kdamond_pid, int, 0400); + +static struct damos_stat damon_reclaim_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat, + reclaim_tried_regions, reclaimed_regions, quota_exceeds); + +static struct damon_ctx *ctx; +static struct damon_target *target; + +static struct damos *damon_reclaim_new_scheme(void) +{ + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and not accessed at all */ + .min_nr_accesses = 0, + .max_nr_accesses = 0, + /* for min_age or more micro-seconds */ + .min_age_region = min_age / + damon_reclaim_mon_attrs.aggr_interval, + .max_age_region = UINT_MAX, + }; + + return damon_new_scheme( + &pattern, + /* page out those, as soon as found */ + DAMOS_PAGEOUT, + /* under the quota. */ + &damon_reclaim_quota, + /* (De)activate this according to the watermarks. */ + &damon_reclaim_wmarks); +} + +static int damon_reclaim_apply_parameters(void) +{ + struct damos *scheme; + int err = 0; + + err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); + if (err) + return err; + + /* Will be freed by next 'damon_set_schemes()' below */ + scheme = damon_reclaim_new_scheme(); + if (!scheme) + return -ENOMEM; + damon_set_schemes(ctx, &scheme, 1); + + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); +} + +static int damon_reclaim_turn(bool on) +{ + int err; + + if (!on) { + err = damon_stop(&ctx, 1); + if (!err) + kdamond_pid = -1; + return err; + } + + err = damon_reclaim_apply_parameters(); + if (err) + return err; + + err = damon_start(&ctx, 1, true); + if (err) + return err; + kdamond_pid = ctx->kdamond->pid; + return 0; +} + +static struct delayed_work damon_reclaim_timer; +static void damon_reclaim_timer_fn(struct work_struct *work) +{ + static bool last_enabled; + bool now_enabled; + + now_enabled = enabled; + if 
(last_enabled != now_enabled) { + if (!damon_reclaim_turn(now_enabled)) + last_enabled = now_enabled; + else + enabled = last_enabled; + } +} +static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); + +static bool damon_reclaim_initialized; + +static int damon_reclaim_enabled_store(const char *val, + const struct kernel_param *kp) +{ + int rc = param_set_bool(val, kp); + + if (rc < 0) + return rc; + + /* system_wq might not initialized yet */ + if (!damon_reclaim_initialized) + return rc; + + schedule_delayed_work(&damon_reclaim_timer, 0); + return 0; +} + +static const struct kernel_param_ops enabled_param_ops = { + .set = damon_reclaim_enabled_store, + .get = param_get_bool, +}; + +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, + "Enable or disable DAMON_RECLAIM (default: disabled)"); + +static int damon_reclaim_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_reclaim_apply_parameters(); + commit_inputs = false; + return err; +} + +static int damon_reclaim_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) + damon_reclaim_stat = s->stat; + + return damon_reclaim_handle_commit_inputs(); +} + +static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) +{ + return damon_reclaim_handle_commit_inputs(); +} + +static int __init damon_reclaim_init(void) +{ + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + + ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; + ctx->callback.after_aggregation = damon_reclaim_after_aggregation; + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + schedule_delayed_work(&damon_reclaim_timer, 0); + + damon_reclaim_initialized = true; + return 0; +} + 
+module_init(damon_reclaim_init); diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c new file mode 100644 index 000000000..9ea21b6d2 --- /dev/null +++ b/mm/damon/sysfs.c @@ -0,0 +1,2909 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON sysfs Interface + * + * Copyright (c) 2022 SeongJae Park + */ + +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(damon_sysfs_lock); + +/* + * unsigned long range directory + */ + +struct damon_sysfs_ul_range { + struct kobject kobj; + unsigned long min; + unsigned long max; +}; + +static struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max) +{ + struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), + GFP_KERNEL); + + if (!range) + return NULL; + range->kobj = (struct kobject){}; + range->min = min; + range->max = max; + + return range; +} + +static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->min); +} + +static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long min; + int err; + + err = kstrtoul(buf, 0, &min); + if (err) + return err; + + range->min = min; + return count; +} + +static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->max); +} + +static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long max; + int err; + + err = kstrtoul(buf, 0, &max); + if (err) + return err; + + 
range->max = max; + return count; +} + +static void damon_sysfs_ul_range_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); +} + +static struct kobj_attribute damon_sysfs_ul_range_min_attr = + __ATTR_RW_MODE(min, 0600); + +static struct kobj_attribute damon_sysfs_ul_range_max_attr = + __ATTR_RW_MODE(max, 0600); + +static struct attribute *damon_sysfs_ul_range_attrs[] = { + &damon_sysfs_ul_range_min_attr.attr, + &damon_sysfs_ul_range_max_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_ul_range); + +static struct kobj_type damon_sysfs_ul_range_ktype = { + .release = damon_sysfs_ul_range_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_ul_range_groups, +}; + +/* + * schemes/stats directory + */ + +struct damon_sysfs_stats { + struct kobject kobj; + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; + unsigned long qt_exceeds; +}; + +static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); +} + +static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_tried); +} + +static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_tried); +} + +static ssize_t nr_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_applied); +} + +static ssize_t sz_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = 
container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_applied); +} + +static ssize_t qt_exceeds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); +} + +static void damon_sysfs_stats_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); +} + +static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = + __ATTR_RO_MODE(nr_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = + __ATTR_RO_MODE(sz_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = + __ATTR_RO_MODE(nr_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = + __ATTR_RO_MODE(sz_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = + __ATTR_RO_MODE(qt_exceeds, 0400); + +static struct attribute *damon_sysfs_stats_attrs[] = { + &damon_sysfs_stats_nr_tried_attr.attr, + &damon_sysfs_stats_sz_tried_attr.attr, + &damon_sysfs_stats_nr_applied_attr.attr, + &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_qt_exceeds_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_stats); + +static struct kobj_type damon_sysfs_stats_ktype = { + .release = damon_sysfs_stats_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_stats_groups, +}; + +/* + * watermarks directory + */ + +struct damon_sysfs_watermarks { + struct kobject kobj; + enum damos_wmark_metric metric; + unsigned long interval_us; + unsigned long high; + unsigned long mid; + unsigned long low; +}; + +static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( + enum damos_wmark_metric metric, unsigned long interval_us, + unsigned long high, unsigned long mid, unsigned long low) +{ + struct damon_sysfs_watermarks *watermarks = kmalloc( + sizeof(*watermarks), 
GFP_KERNEL); + + if (!watermarks) + return NULL; + watermarks->kobj = (struct kobject){}; + watermarks->metric = metric; + watermarks->interval_us = interval_us; + watermarks->high = high; + watermarks->mid = mid; + watermarks->low = low; + return watermarks; +} + +/* Should match with enum damos_wmark_metric */ +static const char * const damon_sysfs_wmark_metric_strs[] = { + "none", + "free_mem_rate", +}; + +static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_wmark_metric_strs[watermarks->metric]); +} + +static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + enum damos_wmark_metric metric; + + for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { + if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { + watermarks->metric = metric; + return count; + } + } + return -EINVAL; +} + +static ssize_t interval_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->interval_us); +} + +static ssize_t interval_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->interval_us); + + return err ? 
err : count; +} + +static ssize_t high_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->high); +} + +static ssize_t high_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->high); + + return err ? err : count; +} + +static ssize_t mid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->mid); +} + +static ssize_t mid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->mid); + + return err ? err : count; +} + +static ssize_t low_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->low); +} + +static ssize_t low_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->low); + + return err ? 
err : count; +} + +static void damon_sysfs_watermarks_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); +} + +static struct kobj_attribute damon_sysfs_watermarks_metric_attr = + __ATTR_RW_MODE(metric, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = + __ATTR_RW_MODE(interval_us, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_high_attr = + __ATTR_RW_MODE(high, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_mid_attr = + __ATTR_RW_MODE(mid, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_low_attr = + __ATTR_RW_MODE(low, 0600); + +static struct attribute *damon_sysfs_watermarks_attrs[] = { + &damon_sysfs_watermarks_metric_attr.attr, + &damon_sysfs_watermarks_interval_us_attr.attr, + &damon_sysfs_watermarks_high_attr.attr, + &damon_sysfs_watermarks_mid_attr.attr, + &damon_sysfs_watermarks_low_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_watermarks); + +static struct kobj_type damon_sysfs_watermarks_ktype = { + .release = damon_sysfs_watermarks_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_watermarks_groups, +}; + +/* + * scheme/weights directory + */ + +struct damon_sysfs_weights { + struct kobject kobj; + unsigned int sz; + unsigned int nr_accesses; + unsigned int age; +}; + +static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, + unsigned int nr_accesses, unsigned int age) +{ + struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), + GFP_KERNEL); + + if (!weights) + return NULL; + weights->kobj = (struct kobject){}; + weights->sz = sz; + weights->nr_accesses = nr_accesses; + weights->age = age; + return weights; +} + +static ssize_t sz_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->sz); +} + +static ssize_t 
sz_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->sz); + + return err ? err : count; +} + +static ssize_t nr_accesses_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->nr_accesses); +} + +static ssize_t nr_accesses_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->nr_accesses); + + return err ? err : count; +} + +static ssize_t age_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->age); +} + +static ssize_t age_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->age); + + return err ? 
err : count; +} + +static void damon_sysfs_weights_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); +} + +static struct kobj_attribute damon_sysfs_weights_sz_attr = + __ATTR_RW_MODE(sz_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = + __ATTR_RW_MODE(nr_accesses_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_age_attr = + __ATTR_RW_MODE(age_permil, 0600); + +static struct attribute *damon_sysfs_weights_attrs[] = { + &damon_sysfs_weights_sz_attr.attr, + &damon_sysfs_weights_nr_accesses_attr.attr, + &damon_sysfs_weights_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_weights); + +static struct kobj_type damon_sysfs_weights_ktype = { + .release = damon_sysfs_weights_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_weights_groups, +}; + +/* + * quotas directory + */ + +struct damon_sysfs_quotas { + struct kobject kobj; + struct damon_sysfs_weights *weights; + unsigned long ms; + unsigned long sz; + unsigned long reset_interval_ms; +}; + +static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); +} + +static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) +{ + struct damon_sysfs_weights *weights; + int err; + + weights = damon_sysfs_weights_alloc(0, 0, 0); + if (!weights) + return -ENOMEM; + + err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, + "as->kobj, "weights"); + if (err) + kobject_put(&weights->kobj); + else + quotas->weights = weights; + return err; +} + +static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) +{ + kobject_put("as->weights->kobj); +} + +static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->ms); +} + +static 
ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->ms); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->sz); +} + +static ssize_t bytes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->sz); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t reset_interval_ms_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms); +} + +static ssize_t reset_interval_ms_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->reset_interval_ms); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_quotas_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); +} + +static struct kobj_attribute damon_sysfs_quotas_ms_attr = + __ATTR_RW_MODE(ms, 0600); + +static struct kobj_attribute damon_sysfs_quotas_sz_attr = + __ATTR_RW_MODE(bytes, 0600); + +static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = + __ATTR_RW_MODE(reset_interval_ms, 0600); + +static struct attribute *damon_sysfs_quotas_attrs[] = { + &damon_sysfs_quotas_ms_attr.attr, + &damon_sysfs_quotas_sz_attr.attr, + 
&damon_sysfs_quotas_reset_interval_ms_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_quotas); + +static struct kobj_type damon_sysfs_quotas_ktype = { + .release = damon_sysfs_quotas_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_quotas_groups, +}; + +/* + * access_pattern directory + */ + +struct damon_sysfs_access_pattern { + struct kobject kobj; + struct damon_sysfs_ul_range *sz; + struct damon_sysfs_ul_range *nr_accesses; + struct damon_sysfs_ul_range *age; +}; + +static +struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) +{ + struct damon_sysfs_access_pattern *access_pattern = + kmalloc(sizeof(*access_pattern), GFP_KERNEL); + + if (!access_pattern) + return NULL; + access_pattern->kobj = (struct kobject){}; + return access_pattern; +} + +static int damon_sysfs_access_pattern_add_range_dir( + struct damon_sysfs_access_pattern *access_pattern, + struct damon_sysfs_ul_range **range_dir_ptr, + char *name) +{ + struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); + int err; + + if (!range) + return -ENOMEM; + err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, + &access_pattern->kobj, name); + if (err) + kobject_put(&range->kobj); + else + *range_dir_ptr = range; + return err; +} + +static int damon_sysfs_access_pattern_add_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + int err; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->sz, "sz"); + if (err) + goto put_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->nr_accesses, "nr_accesses"); + if (err) + goto put_nr_accesses_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->age, "age"); + if (err) + goto put_age_nr_accesses_sz_out; + return 0; + +put_age_nr_accesses_sz_out: + kobject_put(&access_pattern->age->kobj); + access_pattern->age = NULL; +put_nr_accesses_sz_out: + 
kobject_put(&access_pattern->nr_accesses->kobj); + access_pattern->nr_accesses = NULL; +put_sz_out: + kobject_put(&access_pattern->sz->kobj); + access_pattern->sz = NULL; + return err; +} + +static void damon_sysfs_access_pattern_rm_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + kobject_put(&access_pattern->sz->kobj); + kobject_put(&access_pattern->nr_accesses->kobj); + kobject_put(&access_pattern->age->kobj); +} + +static void damon_sysfs_access_pattern_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); +} + +static struct attribute *damon_sysfs_access_pattern_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); + +static struct kobj_type damon_sysfs_access_pattern_ktype = { + .release = damon_sysfs_access_pattern_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_access_pattern_groups, +}; + +/* + * scheme directory + */ + +struct damon_sysfs_scheme { + struct kobject kobj; + enum damos_action action; + struct damon_sysfs_access_pattern *access_pattern; + struct damon_sysfs_quotas *quotas; + struct damon_sysfs_watermarks *watermarks; + struct damon_sysfs_stats *stats; +}; + +/* This should match with enum damos_action */ +static const char * const damon_sysfs_damos_action_strs[] = { + "willneed", + "cold", + "pageout", + "hugepage", + "nohugepage", + "lru_prio", + "lru_deprio", + "stat", +}; + +static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( + enum damos_action action) +{ + struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), + GFP_KERNEL); + + if (!scheme) + return NULL; + scheme->kobj = (struct kobject){}; + scheme->action = action; + return scheme; +} + +static int damon_sysfs_scheme_set_access_pattern( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_access_pattern *access_pattern; + int err; + + access_pattern = damon_sysfs_access_pattern_alloc(); + if (!access_pattern) + return -ENOMEM; + err = 
kobject_init_and_add(&access_pattern->kobj, + &damon_sysfs_access_pattern_ktype, &scheme->kobj, + "access_pattern"); + if (err) + goto out; + err = damon_sysfs_access_pattern_add_dirs(access_pattern); + if (err) + goto out; + scheme->access_pattern = access_pattern; + return 0; + +out: + kobject_put(&access_pattern->kobj); + return err; +} + +static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); + int err; + + if (!quotas) + return -ENOMEM; + err = kobject_init_and_add("as->kobj, &damon_sysfs_quotas_ktype, + &scheme->kobj, "quotas"); + if (err) + goto out; + err = damon_sysfs_quotas_add_dirs(quotas); + if (err) + goto out; + scheme->quotas = quotas; + return 0; + +out: + kobject_put("as->kobj); + return err; +} + +static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_watermarks *watermarks = + damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0); + int err; + + if (!watermarks) + return -ENOMEM; + err = kobject_init_and_add(&watermarks->kobj, + &damon_sysfs_watermarks_ktype, &scheme->kobj, + "watermarks"); + if (err) + kobject_put(&watermarks->kobj); + else + scheme->watermarks = watermarks; + return err; +} + +static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc(); + int err; + + if (!stats) + return -ENOMEM; + err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype, + &scheme->kobj, "stats"); + if (err) + kobject_put(&stats->kobj); + else + scheme->stats = stats; + return err; +} + +static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) +{ + int err; + + err = damon_sysfs_scheme_set_access_pattern(scheme); + if (err) + return err; + err = damon_sysfs_scheme_set_quotas(scheme); + if (err) + goto put_access_pattern_out; + err = damon_sysfs_scheme_set_watermarks(scheme); + if (err) + goto 
put_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_stats(scheme); + if (err) + goto put_watermarks_quotas_access_pattern_out; + return 0; + +put_watermarks_quotas_access_pattern_out: + kobject_put(&scheme->watermarks->kobj); + scheme->watermarks = NULL; +put_quotas_access_pattern_out: + kobject_put(&scheme->quotas->kobj); + scheme->quotas = NULL; +put_access_pattern_out: + kobject_put(&scheme->access_pattern->kobj); + scheme->access_pattern = NULL; + return err; +} + +static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) +{ + damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); + kobject_put(&scheme->access_pattern->kobj); + damon_sysfs_quotas_rm_dirs(scheme->quotas); + kobject_put(&scheme->quotas->kobj); + kobject_put(&scheme->watermarks->kobj); + kobject_put(&scheme->stats->kobj); +} + +static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_damos_action_strs[scheme->action]); +} + +static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + enum damos_action action; + + for (action = 0; action < NR_DAMOS_ACTIONS; action++) { + if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { + scheme->action = action; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_scheme_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_action_attr = + __ATTR_RW_MODE(action, 0600); + +static struct attribute *damon_sysfs_scheme_attrs[] = { + &damon_sysfs_scheme_action_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme); + +static struct kobj_type damon_sysfs_scheme_ktype = { + .release = 
damon_sysfs_scheme_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_groups, +}; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes { + struct kobject kobj; + struct damon_sysfs_scheme **schemes_arr; + int nr; +}; + +static struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); +} + +static void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) +{ + struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; + int i; + + for (i = 0; i < schemes->nr; i++) { + damon_sysfs_scheme_rm_dirs(schemes_arr[i]); + kobject_put(&schemes_arr[i]->kobj); + } + schemes->nr = 0; + kfree(schemes_arr); + schemes->schemes_arr = NULL; +} + +static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, + int nr_schemes) +{ + struct damon_sysfs_scheme **schemes_arr, *scheme; + int err, i; + + damon_sysfs_schemes_rm_dirs(schemes); + if (!nr_schemes) + return 0; + + schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!schemes_arr) + return -ENOMEM; + schemes->schemes_arr = schemes_arr; + + for (i = 0; i < nr_schemes; i++) { + scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); + if (!scheme) { + damon_sysfs_schemes_rm_dirs(schemes); + return -ENOMEM; + } + + err = kobject_init_and_add(&scheme->kobj, + &damon_sysfs_scheme_ktype, &schemes->kobj, + "%d", i); + if (err) + goto out; + err = damon_sysfs_scheme_add_dirs(scheme); + if (err) + goto out; + + schemes_arr[i] = scheme; + schemes->nr++; + } + return 0; + +out: + damon_sysfs_schemes_rm_dirs(schemes); + kobject_put(&scheme->kobj); + return err; +} + +static ssize_t nr_schemes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_schemes *schemes = container_of(kobj, + struct damon_sysfs_schemes, kobj); + + return sysfs_emit(buf, "%d\n", schemes->nr); +} + +static ssize_t nr_schemes_store(struct kobject *kobj, + 
struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_schemes *schemes; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_schemes_add_dirs(schemes, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + return count; +} + +static void damon_sysfs_schemes_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_schemes, kobj)); +} + +static struct kobj_attribute damon_sysfs_schemes_nr_attr = + __ATTR_RW_MODE(nr_schemes, 0600); + +static struct attribute *damon_sysfs_schemes_attrs[] = { + &damon_sysfs_schemes_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_schemes); + +static struct kobj_type damon_sysfs_schemes_ktype = { + .release = damon_sysfs_schemes_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_schemes_groups, +}; + +/* + * init region directory + */ + +struct damon_sysfs_region { + struct kobject kobj; + unsigned long start; + unsigned long end; +}; + +static struct damon_sysfs_region *damon_sysfs_region_alloc( + unsigned long start, + unsigned long end) +{ + struct damon_sysfs_region *region = kmalloc(sizeof(*region), + GFP_KERNEL); + + if (!region) + return NULL; + region->kobj = (struct kobject){}; + region->start = start; + region->end = end; + return region; +} + +static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->start); +} + +static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + int err = kstrtoul(buf, 0, ®ion->start); + + return err ? 
err : count; +} + +static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->end); +} + +static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_region *region = container_of(kobj, + struct damon_sysfs_region, kobj); + int err = kstrtoul(buf, 0, ®ion->end); + + return err ? err : count; +} + +static void damon_sysfs_region_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_region, kobj)); +} + +static struct kobj_attribute damon_sysfs_region_start_attr = + __ATTR_RW_MODE(start, 0600); + +static struct kobj_attribute damon_sysfs_region_end_attr = + __ATTR_RW_MODE(end, 0600); + +static struct attribute *damon_sysfs_region_attrs[] = { + &damon_sysfs_region_start_attr.attr, + &damon_sysfs_region_end_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_region); + +static struct kobj_type damon_sysfs_region_ktype = { + .release = damon_sysfs_region_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_region_groups, +}; + +/* + * init_regions directory + */ + +struct damon_sysfs_regions { + struct kobject kobj; + struct damon_sysfs_region **regions_arr; + int nr; +}; + +static struct damon_sysfs_regions *damon_sysfs_regions_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_regions), GFP_KERNEL); +} + +static void damon_sysfs_regions_rm_dirs(struct damon_sysfs_regions *regions) +{ + struct damon_sysfs_region **regions_arr = regions->regions_arr; + int i; + + for (i = 0; i < regions->nr; i++) + kobject_put(®ions_arr[i]->kobj); + regions->nr = 0; + kfree(regions_arr); + regions->regions_arr = NULL; +} + +static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions, + int nr_regions) +{ + struct damon_sysfs_region **regions_arr, *region; + int err, i; + + 
damon_sysfs_regions_rm_dirs(regions); + if (!nr_regions) + return 0; + + regions_arr = kmalloc_array(nr_regions, sizeof(*regions_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!regions_arr) + return -ENOMEM; + regions->regions_arr = regions_arr; + + for (i = 0; i < nr_regions; i++) { + region = damon_sysfs_region_alloc(0, 0); + if (!region) { + damon_sysfs_regions_rm_dirs(regions); + return -ENOMEM; + } + + err = kobject_init_and_add(®ion->kobj, + &damon_sysfs_region_ktype, ®ions->kobj, + "%d", i); + if (err) { + kobject_put(®ion->kobj); + damon_sysfs_regions_rm_dirs(regions); + return err; + } + + regions_arr[i] = region; + regions->nr++; + } + return 0; +} + +static ssize_t nr_regions_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_regions *regions = container_of(kobj, + struct damon_sysfs_regions, kobj); + + return sysfs_emit(buf, "%d\n", regions->nr); +} + +static ssize_t nr_regions_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_regions *regions; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + regions = container_of(kobj, struct damon_sysfs_regions, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_regions_add_dirs(regions, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_regions_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_regions, kobj)); +} + +static struct kobj_attribute damon_sysfs_regions_nr_attr = + __ATTR_RW_MODE(nr_regions, 0600); + +static struct attribute *damon_sysfs_regions_attrs[] = { + &damon_sysfs_regions_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_regions); + +static struct kobj_type damon_sysfs_regions_ktype = { + .release = damon_sysfs_regions_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_regions_groups, +}; + 
+/* + * target directory + */ + +struct damon_sysfs_target { + struct kobject kobj; + struct damon_sysfs_regions *regions; + int pid; +}; + +static struct damon_sysfs_target *damon_sysfs_target_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_target), GFP_KERNEL); +} + +static int damon_sysfs_target_add_dirs(struct damon_sysfs_target *target) +{ + struct damon_sysfs_regions *regions = damon_sysfs_regions_alloc(); + int err; + + if (!regions) + return -ENOMEM; + + err = kobject_init_and_add(®ions->kobj, &damon_sysfs_regions_ktype, + &target->kobj, "regions"); + if (err) + kobject_put(®ions->kobj); + else + target->regions = regions; + return err; +} + +static void damon_sysfs_target_rm_dirs(struct damon_sysfs_target *target) +{ + damon_sysfs_regions_rm_dirs(target->regions); + kobject_put(&target->regions->kobj); +} + +static ssize_t pid_target_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_target *target = container_of(kobj, + struct damon_sysfs_target, kobj); + + return sysfs_emit(buf, "%d\n", target->pid); +} + +static ssize_t pid_target_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_target *target = container_of(kobj, + struct damon_sysfs_target, kobj); + int err = kstrtoint(buf, 0, &target->pid); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_target_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_target, kobj)); +} + +static struct kobj_attribute damon_sysfs_target_pid_attr = + __ATTR_RW_MODE(pid_target, 0600); + +static struct attribute *damon_sysfs_target_attrs[] = { + &damon_sysfs_target_pid_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_target); + +static struct kobj_type damon_sysfs_target_ktype = { + .release = damon_sysfs_target_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_target_groups, +}; + +/* + * targets directory + */ + +struct 
damon_sysfs_targets { + struct kobject kobj; + struct damon_sysfs_target **targets_arr; + int nr; +}; + +static struct damon_sysfs_targets *damon_sysfs_targets_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_targets), GFP_KERNEL); +} + +static void damon_sysfs_targets_rm_dirs(struct damon_sysfs_targets *targets) +{ + struct damon_sysfs_target **targets_arr = targets->targets_arr; + int i; + + for (i = 0; i < targets->nr; i++) { + damon_sysfs_target_rm_dirs(targets_arr[i]); + kobject_put(&targets_arr[i]->kobj); + } + targets->nr = 0; + kfree(targets_arr); + targets->targets_arr = NULL; +} + +static int damon_sysfs_targets_add_dirs(struct damon_sysfs_targets *targets, + int nr_targets) +{ + struct damon_sysfs_target **targets_arr, *target; + int err, i; + + damon_sysfs_targets_rm_dirs(targets); + if (!nr_targets) + return 0; + + targets_arr = kmalloc_array(nr_targets, sizeof(*targets_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!targets_arr) + return -ENOMEM; + targets->targets_arr = targets_arr; + + for (i = 0; i < nr_targets; i++) { + target = damon_sysfs_target_alloc(); + if (!target) { + damon_sysfs_targets_rm_dirs(targets); + return -ENOMEM; + } + + err = kobject_init_and_add(&target->kobj, + &damon_sysfs_target_ktype, &targets->kobj, + "%d", i); + if (err) + goto out; + + err = damon_sysfs_target_add_dirs(target); + if (err) + goto out; + + targets_arr[i] = target; + targets->nr++; + } + return 0; + +out: + damon_sysfs_targets_rm_dirs(targets); + kobject_put(&target->kobj); + return err; +} + +static ssize_t nr_targets_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_targets *targets = container_of(kobj, + struct damon_sysfs_targets, kobj); + + return sysfs_emit(buf, "%d\n", targets->nr); +} + +static ssize_t nr_targets_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_targets *targets; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return 
err; + if (nr < 0) + return -EINVAL; + + targets = container_of(kobj, struct damon_sysfs_targets, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_targets_add_dirs(targets, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_targets_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_targets, kobj)); +} + +static struct kobj_attribute damon_sysfs_targets_nr_attr = + __ATTR_RW_MODE(nr_targets, 0600); + +static struct attribute *damon_sysfs_targets_attrs[] = { + &damon_sysfs_targets_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_targets); + +static struct kobj_type damon_sysfs_targets_ktype = { + .release = damon_sysfs_targets_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_targets_groups, +}; + +/* + * intervals directory + */ + +struct damon_sysfs_intervals { + struct kobject kobj; + unsigned long sample_us; + unsigned long aggr_us; + unsigned long update_us; +}; + +static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc( + unsigned long sample_us, unsigned long aggr_us, + unsigned long update_us) +{ + struct damon_sysfs_intervals *intervals = kmalloc(sizeof(*intervals), + GFP_KERNEL); + + if (!intervals) + return NULL; + + intervals->kobj = (struct kobject){}; + intervals->sample_us = sample_us; + intervals->aggr_us = aggr_us; + intervals->update_us = update_us; + return intervals; +} + +static ssize_t sample_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + + return sysfs_emit(buf, "%lu\n", intervals->sample_us); +} + +static ssize_t sample_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + unsigned long us; + int err = 
kstrtoul(buf, 0, &us); + + if (err) + return err; + + intervals->sample_us = us; + return count; +} + +static ssize_t aggr_us_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + + return sysfs_emit(buf, "%lu\n", intervals->aggr_us); +} + +static ssize_t aggr_us_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + unsigned long us; + int err = kstrtoul(buf, 0, &us); + + if (err) + return err; + + intervals->aggr_us = us; + return count; +} + +static ssize_t update_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + + return sysfs_emit(buf, "%lu\n", intervals->update_us); +} + +static ssize_t update_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals *intervals = container_of(kobj, + struct damon_sysfs_intervals, kobj); + unsigned long us; + int err = kstrtoul(buf, 0, &us); + + if (err) + return err; + + intervals->update_us = us; + return count; +} + +static void damon_sysfs_intervals_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_intervals, kobj)); +} + +static struct kobj_attribute damon_sysfs_intervals_sample_us_attr = + __ATTR_RW_MODE(sample_us, 0600); + +static struct kobj_attribute damon_sysfs_intervals_aggr_us_attr = + __ATTR_RW_MODE(aggr_us, 0600); + +static struct kobj_attribute damon_sysfs_intervals_update_us_attr = + __ATTR_RW_MODE(update_us, 0600); + +static struct attribute *damon_sysfs_intervals_attrs[] = { + &damon_sysfs_intervals_sample_us_attr.attr, + &damon_sysfs_intervals_aggr_us_attr.attr, + &damon_sysfs_intervals_update_us_attr.attr, + NULL, +}; 
+ATTRIBUTE_GROUPS(damon_sysfs_intervals); + +static struct kobj_type damon_sysfs_intervals_ktype = { + .release = damon_sysfs_intervals_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_intervals_groups, +}; + +/* + * monitoring_attrs directory + */ + +struct damon_sysfs_attrs { + struct kobject kobj; + struct damon_sysfs_intervals *intervals; + struct damon_sysfs_ul_range *nr_regions_range; +}; + +static struct damon_sysfs_attrs *damon_sysfs_attrs_alloc(void) +{ + struct damon_sysfs_attrs *attrs = kmalloc(sizeof(*attrs), GFP_KERNEL); + + if (!attrs) + return NULL; + attrs->kobj = (struct kobject){}; + return attrs; +} + +static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) +{ + struct damon_sysfs_intervals *intervals; + struct damon_sysfs_ul_range *nr_regions_range; + int err; + + intervals = damon_sysfs_intervals_alloc(5000, 100000, 60000000); + if (!intervals) + return -ENOMEM; + + err = kobject_init_and_add(&intervals->kobj, + &damon_sysfs_intervals_ktype, &attrs->kobj, + "intervals"); + if (err) + goto put_intervals_out; + attrs->intervals = intervals; + + nr_regions_range = damon_sysfs_ul_range_alloc(10, 1000); + if (!nr_regions_range) { + err = -ENOMEM; + goto put_intervals_out; + } + + err = kobject_init_and_add(&nr_regions_range->kobj, + &damon_sysfs_ul_range_ktype, &attrs->kobj, + "nr_regions"); + if (err) + goto put_nr_regions_intervals_out; + attrs->nr_regions_range = nr_regions_range; + return 0; + +put_nr_regions_intervals_out: + kobject_put(&nr_regions_range->kobj); + attrs->nr_regions_range = NULL; +put_intervals_out: + kobject_put(&intervals->kobj); + attrs->intervals = NULL; + return err; +} + +static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs) +{ + kobject_put(&attrs->nr_regions_range->kobj); + kobject_put(&attrs->intervals->kobj); +} + +static void damon_sysfs_attrs_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_attrs, kobj)); +} + +static struct 
attribute *damon_sysfs_attrs_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_attrs); + +static struct kobj_type damon_sysfs_attrs_ktype = { + .release = damon_sysfs_attrs_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_attrs_groups, +}; + +/* + * context directory + */ + +/* This should match with enum damon_ops_id */ +static const char * const damon_sysfs_ops_strs[] = { + "vaddr", + "fvaddr", + "paddr", +}; + +struct damon_sysfs_context { + struct kobject kobj; + enum damon_ops_id ops_id; + struct damon_sysfs_attrs *attrs; + struct damon_sysfs_targets *targets; + struct damon_sysfs_schemes *schemes; +}; + +static struct damon_sysfs_context *damon_sysfs_context_alloc( + enum damon_ops_id ops_id) +{ + struct damon_sysfs_context *context = kmalloc(sizeof(*context), + GFP_KERNEL); + + if (!context) + return NULL; + context->kobj = (struct kobject){}; + context->ops_id = ops_id; + return context; +} + +static int damon_sysfs_context_set_attrs(struct damon_sysfs_context *context) +{ + struct damon_sysfs_attrs *attrs = damon_sysfs_attrs_alloc(); + int err; + + if (!attrs) + return -ENOMEM; + err = kobject_init_and_add(&attrs->kobj, &damon_sysfs_attrs_ktype, + &context->kobj, "monitoring_attrs"); + if (err) + goto out; + err = damon_sysfs_attrs_add_dirs(attrs); + if (err) + goto out; + context->attrs = attrs; + return 0; + +out: + kobject_put(&attrs->kobj); + return err; +} + +static int damon_sysfs_context_set_targets(struct damon_sysfs_context *context) +{ + struct damon_sysfs_targets *targets = damon_sysfs_targets_alloc(); + int err; + + if (!targets) + return -ENOMEM; + err = kobject_init_and_add(&targets->kobj, &damon_sysfs_targets_ktype, + &context->kobj, "targets"); + if (err) { + kobject_put(&targets->kobj); + return err; + } + context->targets = targets; + return 0; +} + +static int damon_sysfs_context_set_schemes(struct damon_sysfs_context *context) +{ + struct damon_sysfs_schemes *schemes = damon_sysfs_schemes_alloc(); + int err; 
+ + if (!schemes) + return -ENOMEM; + err = kobject_init_and_add(&schemes->kobj, &damon_sysfs_schemes_ktype, + &context->kobj, "schemes"); + if (err) { + kobject_put(&schemes->kobj); + return err; + } + context->schemes = schemes; + return 0; +} + +static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context) +{ + int err; + + err = damon_sysfs_context_set_attrs(context); + if (err) + return err; + + err = damon_sysfs_context_set_targets(context); + if (err) + goto put_attrs_out; + + err = damon_sysfs_context_set_schemes(context); + if (err) + goto put_targets_attrs_out; + return 0; + +put_targets_attrs_out: + kobject_put(&context->targets->kobj); + context->targets = NULL; +put_attrs_out: + kobject_put(&context->attrs->kobj); + context->attrs = NULL; + return err; +} + +static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context) +{ + damon_sysfs_attrs_rm_dirs(context->attrs); + kobject_put(&context->attrs->kobj); + damon_sysfs_targets_rm_dirs(context->targets); + kobject_put(&context->targets->kobj); + damon_sysfs_schemes_rm_dirs(context->schemes); + kobject_put(&context->schemes->kobj); +} + +static ssize_t avail_operations_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + enum damon_ops_id id; + int len = 0; + + for (id = 0; id < NR_DAMON_OPS; id++) { + if (!damon_is_registered_ops(id)) + continue; + len += sysfs_emit_at(buf, len, "%s\n", + damon_sysfs_ops_strs[id]); + } + return len; +} + +static ssize_t operations_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + + return sysfs_emit(buf, "%s\n", damon_sysfs_ops_strs[context->ops_id]); +} + +static ssize_t operations_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + enum damon_ops_id id; + + for (id 
= 0; id < NR_DAMON_OPS; id++) { + if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) { + context->ops_id = id; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_context_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_context, kobj)); +} + +static struct kobj_attribute damon_sysfs_context_avail_operations_attr = + __ATTR_RO_MODE(avail_operations, 0400); + +static struct kobj_attribute damon_sysfs_context_operations_attr = + __ATTR_RW_MODE(operations, 0600); + +static struct attribute *damon_sysfs_context_attrs[] = { + &damon_sysfs_context_avail_operations_attr.attr, + &damon_sysfs_context_operations_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_context); + +static struct kobj_type damon_sysfs_context_ktype = { + .release = damon_sysfs_context_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_context_groups, +}; + +/* + * contexts directory + */ + +struct damon_sysfs_contexts { + struct kobject kobj; + struct damon_sysfs_context **contexts_arr; + int nr; +}; + +static struct damon_sysfs_contexts *damon_sysfs_contexts_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_contexts), GFP_KERNEL); +} + +static void damon_sysfs_contexts_rm_dirs(struct damon_sysfs_contexts *contexts) +{ + struct damon_sysfs_context **contexts_arr = contexts->contexts_arr; + int i; + + for (i = 0; i < contexts->nr; i++) { + damon_sysfs_context_rm_dirs(contexts_arr[i]); + kobject_put(&contexts_arr[i]->kobj); + } + contexts->nr = 0; + kfree(contexts_arr); + contexts->contexts_arr = NULL; +} + +static int damon_sysfs_contexts_add_dirs(struct damon_sysfs_contexts *contexts, + int nr_contexts) +{ + struct damon_sysfs_context **contexts_arr, *context; + int err, i; + + damon_sysfs_contexts_rm_dirs(contexts); + if (!nr_contexts) + return 0; + + contexts_arr = kmalloc_array(nr_contexts, sizeof(*contexts_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!contexts_arr) + return -ENOMEM; + contexts->contexts_arr = 
contexts_arr; + + for (i = 0; i < nr_contexts; i++) { + context = damon_sysfs_context_alloc(DAMON_OPS_VADDR); + if (!context) { + damon_sysfs_contexts_rm_dirs(contexts); + return -ENOMEM; + } + + err = kobject_init_and_add(&context->kobj, + &damon_sysfs_context_ktype, &contexts->kobj, + "%d", i); + if (err) + goto out; + + err = damon_sysfs_context_add_dirs(context); + if (err) + goto out; + + contexts_arr[i] = context; + contexts->nr++; + } + return 0; + +out: + damon_sysfs_contexts_rm_dirs(contexts); + kobject_put(&context->kobj); + return err; +} + +static ssize_t nr_contexts_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_contexts *contexts = container_of(kobj, + struct damon_sysfs_contexts, kobj); + + return sysfs_emit(buf, "%d\n", contexts->nr); +} + +static ssize_t nr_contexts_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_contexts *contexts; + int nr, err; + + err = kstrtoint(buf, 0, &nr); + if (err) + return err; + /* TODO: support multiple contexts per kdamond */ + if (nr < 0 || 1 < nr) + return -EINVAL; + + contexts = container_of(kobj, struct damon_sysfs_contexts, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_contexts_add_dirs(contexts, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_contexts_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_contexts, kobj)); +} + +static struct kobj_attribute damon_sysfs_contexts_nr_attr + = __ATTR_RW_MODE(nr_contexts, 0600); + +static struct attribute *damon_sysfs_contexts_attrs[] = { + &damon_sysfs_contexts_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_contexts); + +static struct kobj_type damon_sysfs_contexts_ktype = { + .release = damon_sysfs_contexts_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_contexts_groups, +}; + +/* + * kdamond 
directory + */ + +struct damon_sysfs_kdamond { + struct kobject kobj; + struct damon_sysfs_contexts *contexts; + struct damon_ctx *damon_ctx; +}; + +static struct damon_sysfs_kdamond *damon_sysfs_kdamond_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_kdamond), GFP_KERNEL); +} + +static int damon_sysfs_kdamond_add_dirs(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_sysfs_contexts *contexts; + int err; + + contexts = damon_sysfs_contexts_alloc(); + if (!contexts) + return -ENOMEM; + + err = kobject_init_and_add(&contexts->kobj, + &damon_sysfs_contexts_ktype, &kdamond->kobj, + "contexts"); + if (err) { + kobject_put(&contexts->kobj); + return err; + } + kdamond->contexts = contexts; + + return err; +} + +static void damon_sysfs_kdamond_rm_dirs(struct damon_sysfs_kdamond *kdamond) +{ + damon_sysfs_contexts_rm_dirs(kdamond->contexts); + kobject_put(&kdamond->contexts->kobj); +} + +static bool damon_sysfs_ctx_running(struct damon_ctx *ctx) +{ + bool running; + + mutex_lock(&ctx->kdamond_lock); + running = ctx->kdamond != NULL; + mutex_unlock(&ctx->kdamond_lock); + return running; +} + +/* + * enum damon_sysfs_cmd - Commands for a specific kdamond. + */ +enum damon_sysfs_cmd { + /* @DAMON_SYSFS_CMD_ON: Turn the kdamond on. */ + DAMON_SYSFS_CMD_ON, + /* @DAMON_SYSFS_CMD_OFF: Turn the kdamond off. */ + DAMON_SYSFS_CMD_OFF, + /* @DAMON_SYSFS_CMD_COMMIT: Update kdamond inputs. */ + DAMON_SYSFS_CMD_COMMIT, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: Update scheme stats sysfs + * files. + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, + /* + * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. + */ + NR_DAMON_SYSFS_CMDS, +}; + +/* Should match with enum damon_sysfs_cmd */ +static const char * const damon_sysfs_cmd_strs[] = { + "on", + "off", + "commit", + "update_schemes_stats", +}; + +/* + * struct damon_sysfs_cmd_request - A request to the DAMON callback. + * @cmd: The command that needs to be handled by the callback. 
+ * @kdamond: The kobject wrapper that is associated with the kdamond thread. + * + * This structure represents a sysfs command request that needs to access some + * DAMON context-internal data. Because DAMON context-internal data can be + * safely accessed from DAMON callbacks without additional synchronization, the + * request will be handled by the DAMON callback. Non-``NULL`` @kdamond means + * the request is valid. + */ +struct damon_sysfs_cmd_request { + enum damon_sysfs_cmd cmd; + struct damon_sysfs_kdamond *kdamond; +}; + +/* Current DAMON callback request. Protected by damon_sysfs_lock. */ +static struct damon_sysfs_cmd_request damon_sysfs_cmd_request; + +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + struct damon_ctx *ctx = kdamond->damon_ctx; + bool running; + + if (!ctx) + running = false; + else + running = damon_sysfs_ctx_running(ctx); + + return sysfs_emit(buf, "%s\n", running ? 
+ damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] : + damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]); +} + +static int damon_sysfs_set_attrs(struct damon_ctx *ctx, + struct damon_sysfs_attrs *sys_attrs) +{ + struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals; + struct damon_sysfs_ul_range *sys_nr_regions = + sys_attrs->nr_regions_range; + struct damon_attrs attrs = { + .sample_interval = sys_intervals->sample_us, + .aggr_interval = sys_intervals->aggr_us, + .ops_update_interval = sys_intervals->update_us, + .min_nr_regions = sys_nr_regions->min, + .max_nr_regions = sys_nr_regions->max, + }; + return damon_set_attrs(ctx, &attrs); +} + +static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) +{ + struct damon_target *t, *next; + bool has_pid = damon_target_has_pid(ctx); + + damon_for_each_target_safe(t, next, ctx) { + if (has_pid) + put_pid(t->pid); + damon_destroy_target(t); + } +} + +static int damon_sysfs_set_regions(struct damon_target *t, + struct damon_sysfs_regions *sysfs_regions) +{ + struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr, + sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN); + int i, err = -EINVAL; + + if (!ranges) + return -ENOMEM; + for (i = 0; i < sysfs_regions->nr; i++) { + struct damon_sysfs_region *sys_region = + sysfs_regions->regions_arr[i]; + + if (sys_region->start > sys_region->end) + goto out; + + ranges[i].start = sys_region->start; + ranges[i].end = sys_region->end; + if (i == 0) + continue; + if (ranges[i - 1].end > ranges[i].start) + goto out; + } + err = damon_set_regions(t, ranges, sysfs_regions->nr); +out: + kfree(ranges); + return err; + +} + +static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, + struct damon_ctx *ctx) +{ + struct damon_target *t = damon_new_target(); + int err = -EINVAL; + + if (!t) + return -ENOMEM; + damon_add_target(ctx, t); + if (damon_target_has_pid(ctx)) { + t->pid = find_get_pid(sys_target->pid); + if (!t->pid) + goto destroy_targets_out; + } + err = 
damon_sysfs_set_regions(t, sys_target->regions); + if (err) + goto destroy_targets_out; + return 0; + +destroy_targets_out: + damon_sysfs_destroy_targets(ctx); + return err; +} + +static int damon_sysfs_update_target_pid(struct damon_target *target, int pid) +{ + struct pid *pid_new; + + pid_new = find_get_pid(pid); + if (!pid_new) + return -EINVAL; + + if (pid_new == target->pid) { + put_pid(pid_new); + return 0; + } + + put_pid(target->pid); + target->pid = pid_new; + return 0; +} + +static int damon_sysfs_update_target(struct damon_target *target, + struct damon_ctx *ctx, + struct damon_sysfs_target *sys_target) +{ + int err = 0; + + if (damon_target_has_pid(ctx)) { + err = damon_sysfs_update_target_pid(target, sys_target->pid); + if (err) + return err; + } + + /* + * Do monitoring target region boundary update only if one or more + * regions are set by the user. This is for keeping current monitoring + * target results and range easier, especially for dynamic monitoring + * target regions update ops like 'vaddr'. 
+ */ + if (sys_target->regions->nr) + err = damon_sysfs_set_regions(target, sys_target->regions); + return err; +} + +static int damon_sysfs_set_targets(struct damon_ctx *ctx, + struct damon_sysfs_targets *sysfs_targets) +{ + struct damon_target *t, *next; + int i = 0, err; + + /* Multiple physical address space monitoring targets makes no sense */ + if (ctx->ops.id == DAMON_OPS_PADDR && sysfs_targets->nr > 1) + return -EINVAL; + + damon_for_each_target_safe(t, next, ctx) { + if (i < sysfs_targets->nr) { + err = damon_sysfs_update_target(t, ctx, + sysfs_targets->targets_arr[i]); + if (err) + return err; + } else { + if (damon_target_has_pid(ctx)) + put_pid(t->pid); + damon_destroy_target(t); + } + i++; + } + + for (; i < sysfs_targets->nr; i++) { + struct damon_sysfs_target *st = sysfs_targets->targets_arr[i]; + + err = damon_sysfs_add_target(st, ctx); + if (err) + return err; + } + return 0; +} + +static struct damos *damon_sysfs_mk_scheme( + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct damon_sysfs_access_pattern *access_pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + struct damos_access_pattern pattern = { + .min_sz_region = access_pattern->sz->min, + .max_sz_region = access_pattern->sz->max, + .min_nr_accesses = access_pattern->nr_accesses->min, + .max_nr_accesses = access_pattern->nr_accesses->max, + .min_age_region = access_pattern->age->min, + .max_age_region = access_pattern->age->max, + }; + struct damos_quota quota = { + .ms = sysfs_quotas->ms, + .sz = sysfs_quotas->sz, + .reset_interval = sysfs_quotas->reset_interval_ms, + .weight_sz = sysfs_weights->sz, + .weight_nr_accesses = sysfs_weights->nr_accesses, + .weight_age = sysfs_weights->age, + }; + struct damos_watermarks wmarks = { + .metric = sysfs_wmarks->metric, + .interval = 
sysfs_wmarks->interval_us, + .high = sysfs_wmarks->high, + .mid = sysfs_wmarks->mid, + .low = sysfs_wmarks->low, + }; + + return damon_new_scheme(&pattern, sysfs_scheme->action, &quota, + &wmarks); +} + +static void damon_sysfs_update_scheme(struct damos *scheme, + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct damon_sysfs_access_pattern *access_pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + scheme->pattern.min_sz_region = access_pattern->sz->min; + scheme->pattern.max_sz_region = access_pattern->sz->max; + scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min; + scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max; + scheme->pattern.min_age_region = access_pattern->age->min; + scheme->pattern.max_age_region = access_pattern->age->max; + + scheme->action = sysfs_scheme->action; + + scheme->quota.ms = sysfs_quotas->ms; + scheme->quota.sz = sysfs_quotas->sz; + scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms; + scheme->quota.weight_sz = sysfs_weights->sz; + scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses; + scheme->quota.weight_age = sysfs_weights->age; + + scheme->wmarks.metric = sysfs_wmarks->metric; + scheme->wmarks.interval = sysfs_wmarks->interval_us; + scheme->wmarks.high = sysfs_wmarks->high; + scheme->wmarks.mid = sysfs_wmarks->mid; + scheme->wmarks.low = sysfs_wmarks->low; +} + +static int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes) +{ + struct damos *scheme, *next; + int i = 0; + + damon_for_each_scheme_safe(scheme, next, ctx) { + if (i < sysfs_schemes->nr) + damon_sysfs_update_scheme(scheme, + sysfs_schemes->schemes_arr[i]); + else + damon_destroy_scheme(scheme); + i++; + } + + for (; i < sysfs_schemes->nr; i++) { + struct damos *scheme, *next; + + 
scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); + if (!scheme) { + damon_for_each_scheme_safe(scheme, next, ctx) + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damon_add_scheme(ctx, scheme); + } + return 0; +} + +static void damon_sysfs_before_terminate(struct damon_ctx *ctx) +{ + struct damon_target *t, *next; + + if (!damon_target_has_pid(ctx)) + return; + + mutex_lock(&ctx->kdamond_lock); + damon_for_each_target_safe(t, next, ctx) { + put_pid(t->pid); + damon_destroy_target(t); + } + mutex_unlock(&ctx->kdamond_lock); +} + +/* + * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. + * @kdamond: The kobject wrapper that is associated with the kdamond thread. + * + * This function reads the schemes stats of a specific kdamond and updates the + * related values for sysfs files. This function should be called from DAMON + * callbacks while holding ``damon_sysfs_lock``, to safely access the DAMON + * contexts-internal data and DAMON sysfs variables. + */ +static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + struct damon_sysfs_schemes *sysfs_schemes; + struct damos *scheme; + int schemes_idx = 0; + + if (!ctx) + return -EINVAL; + sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes; + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_stats *sysfs_stats; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; + sysfs_stats->nr_tried = scheme->stat.nr_tried; + sysfs_stats->sz_tried = scheme->stat.sz_tried; + sysfs_stats->nr_applied = scheme->stat.nr_applied; + sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + } + return 0; +} + +static inline bool damon_sysfs_kdamond_running( + struct damon_sysfs_kdamond *kdamond) +{ + return kdamond->damon_ctx && + 
damon_sysfs_ctx_running(kdamond->damon_ctx); +} + +static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, + struct damon_sysfs_context *sys_ctx) +{ + int err; + + err = damon_select_ops(ctx, sys_ctx->ops_id); + if (err) + return err; + err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); + if (err) + return err; + err = damon_sysfs_set_targets(ctx, sys_ctx->targets); + if (err) + return err; + return damon_sysfs_set_schemes(ctx, sys_ctx->schemes); +} + +/* + * damon_sysfs_commit_input() - Commit user inputs to a running kdamond. + * @kdamond: The kobject wrapper for the associated kdamond. + * + * If the sysfs input is wrong, the kdamond will be terminated. + */ +static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) +{ + if (!damon_sysfs_kdamond_running(kdamond)) + return -EINVAL; + /* TODO: Support multiple contexts per kdamond */ + if (kdamond->contexts->nr != 1) + return -EINVAL; + + return damon_sysfs_apply_inputs(kdamond->damon_ctx, + kdamond->contexts->contexts_arr[0]); +} + +/* + * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. + * @c: The DAMON context of the callback. + * + * This function is periodically called back from the kdamond thread for @c. + * Then, it checks if there is a waiting DAMON sysfs request and handles it. + */ +static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) +{ + struct damon_sysfs_kdamond *kdamond; + int err = 0; + + /* avoid deadlock due to concurrent state_store('off') */ + if (!mutex_trylock(&damon_sysfs_lock)) + return 0; + kdamond = damon_sysfs_cmd_request.kdamond; + if (!kdamond || kdamond->damon_ctx != c) + goto out; + switch (damon_sysfs_cmd_request.cmd) { + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: + err = damon_sysfs_upd_schemes_stats(kdamond); + break; + case DAMON_SYSFS_CMD_COMMIT: + err = damon_sysfs_commit_input(kdamond); + break; + default: + break; + } + /* Mark the request as invalid now. 
*/ + damon_sysfs_cmd_request.kdamond = NULL; +out: + mutex_unlock(&damon_sysfs_lock); + return err; +} + +static struct damon_ctx *damon_sysfs_build_ctx( + struct damon_sysfs_context *sys_ctx) +{ + struct damon_ctx *ctx = damon_new_ctx(); + int err; + + if (!ctx) + return ERR_PTR(-ENOMEM); + + err = damon_sysfs_apply_inputs(ctx, sys_ctx); + if (err) { + damon_destroy_ctx(ctx); + return ERR_PTR(err); + } + + ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback; + ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback; + ctx->callback.before_terminate = damon_sysfs_before_terminate; + return ctx; +} + +static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx; + int err; + + if (damon_sysfs_kdamond_running(kdamond)) + return -EBUSY; + if (damon_sysfs_cmd_request.kdamond == kdamond) + return -EBUSY; + /* TODO: support multiple contexts per kdamond */ + if (kdamond->contexts->nr != 1) + return -EINVAL; + + if (kdamond->damon_ctx) + damon_destroy_ctx(kdamond->damon_ctx); + kdamond->damon_ctx = NULL; + + ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + err = damon_start(&ctx, 1, false); + if (err) { + damon_destroy_ctx(ctx); + return err; + } + kdamond->damon_ctx = ctx; + return err; +} + +static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond) +{ + if (!kdamond->damon_ctx) + return -EINVAL; + return damon_stop(&kdamond->damon_ctx, 1); + /* + * To allow users show final monitoring results of already turned-off + * DAMON, we free kdamond->damon_ctx in next + * damon_sysfs_turn_damon_on(), or kdamonds_nr_store() + */ +} + +/* + * damon_sysfs_handle_cmd() - Handle a command for a specific kdamond. + * @cmd: The command to handle. + * @kdamond: The kobject wrapper for the associated kdamond. + * + * This function handles a DAMON sysfs command for a kdamond. 
For commands + * that need to access running DAMON context-internal data, it requests + * handling of the command to the DAMON callback + * (@damon_sysfs_cmd_request_callback()) and waits until it is properly handled, + * or the context is completed. + * + * Return: 0 on success, negative error code otherwise. + */ +static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, + struct damon_sysfs_kdamond *kdamond) +{ + bool need_wait = true; + + /* Handle commands that don't access DAMON context-internal data */ + switch (cmd) { + case DAMON_SYSFS_CMD_ON: + return damon_sysfs_turn_damon_on(kdamond); + case DAMON_SYSFS_CMD_OFF: + return damon_sysfs_turn_damon_off(kdamond); + default: + break; + } + + /* Pass the command to DAMON callback for safe DAMON context access */ + if (damon_sysfs_cmd_request.kdamond) + return -EBUSY; + if (!damon_sysfs_kdamond_running(kdamond)) + return -EINVAL; + damon_sysfs_cmd_request.cmd = cmd; + damon_sysfs_cmd_request.kdamond = kdamond; + + /* + * wait until damon_sysfs_cmd_request_callback() handles the request + * from kdamond context + */ + mutex_unlock(&damon_sysfs_lock); + while (need_wait) { + schedule_timeout_idle(msecs_to_jiffies(100)); + if (!mutex_trylock(&damon_sysfs_lock)) + continue; + if (!damon_sysfs_cmd_request.kdamond) { + /* damon_sysfs_cmd_request_callback() handled */ + need_wait = false; + } else if (!damon_sysfs_kdamond_running(kdamond)) { + /* kdamond has already finished */ + need_wait = false; + damon_sysfs_cmd_request.kdamond = NULL; + } + mutex_unlock(&damon_sysfs_lock); + } + mutex_lock(&damon_sysfs_lock); + return 0; +} + +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + enum damon_sysfs_cmd cmd; + ssize_t ret = -EINVAL; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + for (cmd = 0; cmd < NR_DAMON_SYSFS_CMDS; cmd++) { + if 
(sysfs_streq(buf, damon_sysfs_cmd_strs[cmd])) { + ret = damon_sysfs_handle_cmd(cmd, kdamond); + break; + } + } + mutex_unlock(&damon_sysfs_lock); + if (!ret) + ret = count; + return ret; +} + +static ssize_t pid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + struct damon_ctx *ctx; + int pid = -1; + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + ctx = kdamond->damon_ctx; + if (!ctx) + goto out; + + mutex_lock(&ctx->kdamond_lock); + if (ctx->kdamond) + pid = ctx->kdamond->pid; + mutex_unlock(&ctx->kdamond_lock); +out: + mutex_unlock(&damon_sysfs_lock); + return sysfs_emit(buf, "%d\n", pid); +} + +static void damon_sysfs_kdamond_release(struct kobject *kobj) +{ + struct damon_sysfs_kdamond *kdamond = container_of(kobj, + struct damon_sysfs_kdamond, kobj); + + if (kdamond->damon_ctx) + damon_destroy_ctx(kdamond->damon_ctx); + kfree(kdamond); +} + +static struct kobj_attribute damon_sysfs_kdamond_state_attr = + __ATTR_RW_MODE(state, 0600); + +static struct kobj_attribute damon_sysfs_kdamond_pid_attr = + __ATTR_RO_MODE(pid, 0400); + +static struct attribute *damon_sysfs_kdamond_attrs[] = { + &damon_sysfs_kdamond_state_attr.attr, + &damon_sysfs_kdamond_pid_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_kdamond); + +static struct kobj_type damon_sysfs_kdamond_ktype = { + .release = damon_sysfs_kdamond_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_kdamond_groups, +}; + +/* + * kdamonds directory + */ + +struct damon_sysfs_kdamonds { + struct kobject kobj; + struct damon_sysfs_kdamond **kdamonds_arr; + int nr; +}; + +static struct damon_sysfs_kdamonds *damon_sysfs_kdamonds_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_kdamonds), GFP_KERNEL); +} + +static void damon_sysfs_kdamonds_rm_dirs(struct damon_sysfs_kdamonds *kdamonds) +{ + struct damon_sysfs_kdamond **kdamonds_arr = kdamonds->kdamonds_arr; 
+ int i; + + for (i = 0; i < kdamonds->nr; i++) { + damon_sysfs_kdamond_rm_dirs(kdamonds_arr[i]); + kobject_put(&kdamonds_arr[i]->kobj); + } + kdamonds->nr = 0; + kfree(kdamonds_arr); + kdamonds->kdamonds_arr = NULL; +} + +static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds, + int nr_kdamonds) +{ + int i; + + for (i = 0; i < nr_kdamonds; i++) { + if (damon_sysfs_kdamond_running(kdamonds[i]) || + damon_sysfs_cmd_request.kdamond == kdamonds[i]) + return true; + } + + return false; +} + +static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, + int nr_kdamonds) +{ + struct damon_sysfs_kdamond **kdamonds_arr, *kdamond; + int err, i; + + if (damon_sysfs_kdamonds_busy(kdamonds->kdamonds_arr, kdamonds->nr)) + return -EBUSY; + + damon_sysfs_kdamonds_rm_dirs(kdamonds); + if (!nr_kdamonds) + return 0; + + kdamonds_arr = kmalloc_array(nr_kdamonds, sizeof(*kdamonds_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!kdamonds_arr) + return -ENOMEM; + kdamonds->kdamonds_arr = kdamonds_arr; + + for (i = 0; i < nr_kdamonds; i++) { + kdamond = damon_sysfs_kdamond_alloc(); + if (!kdamond) { + damon_sysfs_kdamonds_rm_dirs(kdamonds); + return -ENOMEM; + } + + err = kobject_init_and_add(&kdamond->kobj, + &damon_sysfs_kdamond_ktype, &kdamonds->kobj, + "%d", i); + if (err) + goto out; + + err = damon_sysfs_kdamond_add_dirs(kdamond); + if (err) + goto out; + + kdamonds_arr[i] = kdamond; + kdamonds->nr++; + } + return 0; + +out: + damon_sysfs_kdamonds_rm_dirs(kdamonds); + kobject_put(&kdamond->kobj); + return err; +} + +static ssize_t nr_kdamonds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_kdamonds *kdamonds = container_of(kobj, + struct damon_sysfs_kdamonds, kobj); + + return sysfs_emit(buf, "%d\n", kdamonds->nr); +} + +static ssize_t nr_kdamonds_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_kdamonds *kdamonds; + int nr, err; + + err = 
kstrtoint(buf, 0, &nr); + if (err) + return err; + if (nr < 0) + return -EINVAL; + + kdamonds = container_of(kobj, struct damon_sysfs_kdamonds, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_kdamonds_add_dirs(kdamonds, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + + return count; +} + +static void damon_sysfs_kdamonds_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_kdamonds, kobj)); +} + +static struct kobj_attribute damon_sysfs_kdamonds_nr_attr = + __ATTR_RW_MODE(nr_kdamonds, 0600); + +static struct attribute *damon_sysfs_kdamonds_attrs[] = { + &damon_sysfs_kdamonds_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_kdamonds); + +static struct kobj_type damon_sysfs_kdamonds_ktype = { + .release = damon_sysfs_kdamonds_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_kdamonds_groups, +}; + +/* + * damon user interface directory + */ + +struct damon_sysfs_ui_dir { + struct kobject kobj; + struct damon_sysfs_kdamonds *kdamonds; +}; + +static struct damon_sysfs_ui_dir *damon_sysfs_ui_dir_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_ui_dir), GFP_KERNEL); +} + +static int damon_sysfs_ui_dir_add_dirs(struct damon_sysfs_ui_dir *ui_dir) +{ + struct damon_sysfs_kdamonds *kdamonds; + int err; + + kdamonds = damon_sysfs_kdamonds_alloc(); + if (!kdamonds) + return -ENOMEM; + + err = kobject_init_and_add(&kdamonds->kobj, + &damon_sysfs_kdamonds_ktype, &ui_dir->kobj, + "kdamonds"); + if (err) { + kobject_put(&kdamonds->kobj); + return err; + } + ui_dir->kdamonds = kdamonds; + return err; +} + +static void damon_sysfs_ui_dir_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ui_dir, kobj)); +} + +static struct attribute *damon_sysfs_ui_dir_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_ui_dir); + +static struct kobj_type damon_sysfs_ui_dir_ktype = { + .release = damon_sysfs_ui_dir_release, + .sysfs_ops = 
&kobj_sysfs_ops, + .default_groups = damon_sysfs_ui_dir_groups, +}; + +static int __init damon_sysfs_init(void) +{ + struct kobject *damon_sysfs_root; + struct damon_sysfs_ui_dir *admin; + int err; + + damon_sysfs_root = kobject_create_and_add("damon", mm_kobj); + if (!damon_sysfs_root) + return -ENOMEM; + + admin = damon_sysfs_ui_dir_alloc(); + if (!admin) { + kobject_put(damon_sysfs_root); + return -ENOMEM; + } + err = kobject_init_and_add(&admin->kobj, &damon_sysfs_ui_dir_ktype, + damon_sysfs_root, "admin"); + if (err) + goto out; + err = damon_sysfs_ui_dir_add_dirs(admin); + if (err) + goto out; + return 0; + +out: + kobject_put(&admin->kobj); + kobject_put(damon_sysfs_root); + return err; +} +subsys_initcall(damon_sysfs_init); diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h new file mode 100644 index 000000000..e939598af --- /dev/null +++ b/mm/damon/vaddr-test.h @@ -0,0 +1,316 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Access Monitor Unit Tests + * + * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST + +#ifndef _DAMON_VADDR_TEST_H +#define _DAMON_VADDR_TEST_H + +#include <kunit/test.h> + +static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, + ssize_t nr_vmas) +{ + int i; + MA_STATE(mas, mt, 0, 0); + + if (!nr_vmas) + return; + + mas_lock(&mas); + for (i = 0; i < nr_vmas; i++) + vma_mas_store(&vmas[i], &mas); + mas_unlock(&mas); +} + +/* + * Test __damon_va_three_regions() function + * + * In case of virtual memory address spaces monitoring, DAMON converts the + * complex and dynamic memory mappings of each target task to three + * discontiguous regions which cover every mapped area. However, the three + * regions should not include the two biggest unmapped areas in the original + * mapping, because the two biggest areas are normally the areas between 1) + * heap and the mmap()-ed regions, and 2) the mmap()-ed regions and stack. 
+ * Because these two unmapped areas are very huge but obviously never accessed, + * covering the region is just a waste. + * + * '__damon_va_three_regions()' receives an address space of a process. It + * first identifies the start of mappings, end of mappings, and the two biggest + * unmapped areas. After that, based on the information, it constructs the + * three regions and returns. For more detail, refer to the comment of + * 'damon_init_regions_of()' function definition in 'mm/damon.c' file. + * + * For example, suppose virtual address ranges of 10-20, 20-25, 200-210, + * 210-220, 300-305, and 307-330 (Other comments represent these mappings in + * a shorter form: 10-20-25, 200-210-220, 300-305, 307-330) of a process are + * mapped. To cover all the mappings, the three regions should start with 10, + * and end with 330. The process also has three unmapped areas, 25-200, + * 220-300, and 305-307. Among those, 25-200 and 220-300 are the biggest two + * unmapped areas, and thus it should be converted to three regions of 10-25, + * 200-220, and 300-330. 
+ */ +static void damon_test_three_regions_in_vmas(struct kunit *test) +{ + static struct mm_struct mm; + struct damon_addr_range regions[3] = {0,}; + /* 10-20-25, 200-210-220, 300-305, 307-330 */ + struct vm_area_struct vmas[] = { + (struct vm_area_struct) {.vm_start = 10, .vm_end = 20}, + (struct vm_area_struct) {.vm_start = 20, .vm_end = 25}, + (struct vm_area_struct) {.vm_start = 200, .vm_end = 210}, + (struct vm_area_struct) {.vm_start = 210, .vm_end = 220}, + (struct vm_area_struct) {.vm_start = 300, .vm_end = 305}, + (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, + }; + + mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); + __link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas)); + + __damon_va_three_regions(&mm, regions); + + KUNIT_EXPECT_EQ(test, 10ul, regions[0].start); + KUNIT_EXPECT_EQ(test, 25ul, regions[0].end); + KUNIT_EXPECT_EQ(test, 200ul, regions[1].start); + KUNIT_EXPECT_EQ(test, 220ul, regions[1].end); + KUNIT_EXPECT_EQ(test, 300ul, regions[2].start); + KUNIT_EXPECT_EQ(test, 330ul, regions[2].end); +} + +static struct damon_region *__nth_region_of(struct damon_target *t, int idx) +{ + struct damon_region *r; + unsigned int i = 0; + + damon_for_each_region(r, t) { + if (i++ == idx) + return r; + } + + return NULL; +} + +/* + * Test 'damon_set_regions()' + * + * test kunit object + * regions an array containing start/end addresses of current + * monitoring target regions + * nr_regions the number of the addresses in 'regions' + * three_regions The three regions that need to be applied now + * expected start/end addresses of monitoring target regions that + * 'three_regions' are applied + * nr_expected the number of addresses in 'expected' + * + * The memory mapping of the target processes changes dynamically. To follow + * the change, DAMON periodically reads the mappings, simplifies it to the + * three regions, and updates the monitoring target regions to fit in the three + * regions. 
The update of current target regions is the role of + * 'damon_set_regions()'. + * + * This test passes the given target regions and the new three regions that + * need to be applied to the function and check whether it updates the regions + * as expected. + */ +static void damon_do_test_apply_three_regions(struct kunit *test, + unsigned long *regions, int nr_regions, + struct damon_addr_range *three_regions, + unsigned long *expected, int nr_expected) +{ + struct damon_target *t; + struct damon_region *r; + int i; + + t = damon_new_target(); + for (i = 0; i < nr_regions / 2; i++) { + r = damon_new_region(regions[i * 2], regions[i * 2 + 1]); + damon_add_region(r, t); + } + + damon_set_regions(t, three_regions, 3); + + for (i = 0; i < nr_expected / 2; i++) { + r = __nth_region_of(t, i); + KUNIT_EXPECT_EQ(test, r->ar.start, expected[i * 2]); + KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]); + } + + damon_destroy_target(t); +} + +/* + * This function test most common case where the three big regions are only + * slightly changed. Target regions should adjust their boundary (10-20-30, + * 50-55, 70-80, 90-100) to fit with the new big regions or remove target + * regions (57-79) that now out of the three regions. + */ +static void damon_test_apply_three_regions1(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 45-55, 73-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 45, .end = 55}, + (struct damon_addr_range){.start = 73, .end = 104} }; + /* 5-20-27, 45-55, 73-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 45, 55, + 73, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test slightly bigger change. 
Similar to above, but the second big region + * now require two target regions (50-55, 57-59) to be removed. + */ +static void damon_test_apply_three_regions2(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 56-57, 65-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 56, .end = 57}, + (struct damon_addr_range){.start = 65, .end = 104} }; + /* 5-20-27, 56-57, 65-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 56, 57, + 65, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test a big change. The second big region has totally freed and mapped to + * different area (50-59 -> 61-63). The target regions which were in the old + * second big region (50-55-57-59) should be removed and new target region + * covering the second big region (61-63) should be created. + */ +static void damon_test_apply_three_regions3(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-27, 61-63, 65-104 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 27}, + (struct damon_addr_range){.start = 61, .end = 63}, + (struct damon_addr_range){.start = 65, .end = 104} }; + /* 5-20-27, 61-63, 65-80-90-104 */ + unsigned long expected[] = {5, 20, 20, 27, 61, 63, + 65, 80, 80, 90, 90, 104}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +/* + * Test another big change. Both of the second and third big regions (50-59 + * and 70-100) has totally freed and mapped to different area (30-32 and + * 65-68). 
The target regions which were in the old second and third big + * regions should now be removed and new target regions covering the new second + * and third big regions should be created. + */ +static void damon_test_apply_three_regions4(struct kunit *test) +{ + /* 10-20-30, 50-55-57-59, 70-80-90-100 */ + unsigned long regions[] = {10, 20, 20, 30, 50, 55, 55, 57, 57, 59, + 70, 80, 80, 90, 90, 100}; + /* 5-7, 30-32, 65-68 */ + struct damon_addr_range new_three_regions[3] = { + (struct damon_addr_range){.start = 5, .end = 7}, + (struct damon_addr_range){.start = 30, .end = 32}, + (struct damon_addr_range){.start = 65, .end = 68} }; + /* expect 5-7, 30-32, 65-68 */ + unsigned long expected[] = {5, 7, 30, 32, 65, 68}; + + damon_do_test_apply_three_regions(test, regions, ARRAY_SIZE(regions), + new_three_regions, expected, ARRAY_SIZE(expected)); +} + +static void damon_test_split_evenly_fail(struct kunit *test, + unsigned long start, unsigned long end, unsigned int nr_pieces) +{ + struct damon_target *t = damon_new_target(); + struct damon_region *r = damon_new_region(start, end); + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, + damon_va_evenly_split_region(t, r, nr_pieces), -EINVAL); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u); + + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, start); + KUNIT_EXPECT_EQ(test, r->ar.end, end); + } + + damon_free_target(t); +} + +static void damon_test_split_evenly_succ(struct kunit *test, + unsigned long start, unsigned long end, unsigned int nr_pieces) +{ + struct damon_target *t = damon_new_target(); + struct damon_region *r = damon_new_region(start, end); + unsigned long expected_width = (end - start) / nr_pieces; + unsigned long i = 0; + + damon_add_region(r, t); + KUNIT_EXPECT_EQ(test, + damon_va_evenly_split_region(t, r, nr_pieces), 0); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_pieces); + + damon_for_each_region(r, t) { + if (i == nr_pieces - 1) { + KUNIT_EXPECT_EQ(test, + r->ar.start, start + i 
* expected_width); + KUNIT_EXPECT_EQ(test, r->ar.end, end); + break; + } + KUNIT_EXPECT_EQ(test, + r->ar.start, start + i++ * expected_width); + KUNIT_EXPECT_EQ(test, r->ar.end, start + i * expected_width); + } + damon_free_target(t); +} + +static void damon_test_split_evenly(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5), + -EINVAL); + + damon_test_split_evenly_fail(test, 0, 100, 0); + damon_test_split_evenly_succ(test, 0, 100, 10); + damon_test_split_evenly_succ(test, 5, 59, 5); + damon_test_split_evenly_fail(test, 5, 6, 2); +} + +static struct kunit_case damon_test_cases[] = { + KUNIT_CASE(damon_test_three_regions_in_vmas), + KUNIT_CASE(damon_test_apply_three_regions1), + KUNIT_CASE(damon_test_apply_three_regions2), + KUNIT_CASE(damon_test_apply_three_regions3), + KUNIT_CASE(damon_test_apply_three_regions4), + KUNIT_CASE(damon_test_split_evenly), + {}, +}; + +static struct kunit_suite damon_test_suite = { + .name = "damon-operations", + .test_cases = damon_test_cases, +}; +kunit_test_suite(damon_test_suite); + +#endif /* _DAMON_VADDR_TEST_H */ + +#endif /* CONFIG_DAMON_VADDR_KUNIT_TEST */ diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c new file mode 100644 index 000000000..26d561af7 --- /dev/null +++ b/mm/damon/vaddr.c @@ -0,0 +1,715 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON Primitives for Virtual Address Spaces + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-va: " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include "ops-common.h" + +#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST +#undef DAMON_MIN_REGION +#define DAMON_MIN_REGION 1 +#endif + +/* + * 't->pid' should be the pointer to the relevant 'struct pid' having reference + * count. Caller must put the returned task, unless it is NULL. 
+ */ +static inline struct task_struct *damon_get_task_struct(struct damon_target *t) +{ + return get_pid_task(t->pid, PIDTYPE_PID); +} + +/* + * Get the mm_struct of the given target + * + * Caller _must_ put the mm_struct after use, unless it is NULL. + * + * Returns the mm_struct of the target on success, NULL on failure + */ +static struct mm_struct *damon_get_mm(struct damon_target *t) +{ + struct task_struct *task; + struct mm_struct *mm; + + task = damon_get_task_struct(t); + if (!task) + return NULL; + + mm = get_task_mm(task); + put_task_struct(task); + return mm; +} + +/* + * Functions for the initial monitoring target regions construction + */ + +/* + * Size-evenly split a region into 'nr_pieces' small regions + * + * Returns 0 on success, or negative error code otherwise. + */ +static int damon_va_evenly_split_region(struct damon_target *t, + struct damon_region *r, unsigned int nr_pieces) +{ + unsigned long sz_orig, sz_piece, orig_end; + struct damon_region *n = NULL, *next; + unsigned long start; + + if (!r || !nr_pieces) + return -EINVAL; + + orig_end = r->ar.end; + sz_orig = damon_sz_region(r); + sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); + + if (!sz_piece) + return -EINVAL; + + r->ar.end = r->ar.start + sz_piece; + next = damon_next_region(r); + for (start = r->ar.end; start + sz_piece <= orig_end; + start += sz_piece) { + n = damon_new_region(start, start + sz_piece); + if (!n) + return -ENOMEM; + damon_insert_region(n, r, next, t); + r = n; + } + /* complement last region for possible rounding error */ + if (n) + n->ar.end = orig_end; + + return 0; +} + +static unsigned long sz_range(struct damon_addr_range *r) +{ + return r->end - r->start; +} + +/* + * Find three regions separated by two biggest unmapped regions + * + * vma the head vma of the target address space + * regions an array of three address ranges that results will be saved + * + * This function receives an address space and finds three regions in it which + * 
separated by the two biggest unmapped regions in the space. Please refer to + * below comments of '__damon_va_init_regions()' function to know why this is + * necessary. + * + * Returns 0 if success, or negative error code otherwise. + */ +static int __damon_va_three_regions(struct mm_struct *mm, + struct damon_addr_range regions[3]) +{ + struct damon_addr_range first_gap = {0}, second_gap = {0}; + VMA_ITERATOR(vmi, mm, 0); + struct vm_area_struct *vma, *prev = NULL; + unsigned long start; + + /* + * Find the two biggest gaps so that first_gap > second_gap > others. + * If this is too slow, it can be optimised to examine the maple + * tree gaps. + */ + for_each_vma(vmi, vma) { + unsigned long gap; + + if (!prev) { + start = vma->vm_start; + goto next; + } + gap = vma->vm_start - prev->vm_end; + + if (gap > sz_range(&first_gap)) { + second_gap = first_gap; + first_gap.start = prev->vm_end; + first_gap.end = vma->vm_start; + } else if (gap > sz_range(&second_gap)) { + second_gap.start = prev->vm_end; + second_gap.end = vma->vm_start; + } +next: + prev = vma; + } + + if (!sz_range(&second_gap) || !sz_range(&first_gap)) + return -EINVAL; + + /* Sort the two biggest gaps by address */ + if (first_gap.start > second_gap.start) + swap(first_gap, second_gap); + + /* Store the result */ + regions[0].start = ALIGN(start, DAMON_MIN_REGION); + regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION); + regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); + regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); + regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); + regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION); + + return 0; +} + +/* + * Get the three regions in the given target (task) + * + * Returns 0 on success, negative error code otherwise. 
+ */
+static int damon_va_three_regions(struct damon_target *t,
+				struct damon_addr_range regions[3])
+{
+	struct mm_struct *mm;
+	int rc;
+
+	mm = damon_get_mm(t);
+	if (!mm)
+		return -EINVAL;
+
+	mmap_read_lock(mm);
+	rc = __damon_va_three_regions(mm, regions);
+	mmap_read_unlock(mm);
+
+	mmput(mm);
+	return rc;
+}
+
+/*
+ * Initialize the monitoring target regions for the given target (task)
+ *
+ * ctx	the monitoring context that 't' belongs to
+ * t	the given target
+ *
+ * Because only a number of small portions of the entire address space
+ * is actually mapped to the memory and accessed, monitoring the unmapped
+ * regions is wasteful.  That said, because we can deal with small noises,
+ * tracking every mapping is not strictly required but could even incur a high
+ * overhead if the mapping frequently changes or the number of mappings is
+ * high.  The adaptive regions adjustment mechanism will further help to deal
+ * with the noise by simply identifying the unmapped areas as a region that
+ * has no access.  Moreover, applying the real mappings that would have many
+ * unmapped areas inside will make the adaptive mechanism quite complex.  That
+ * said, too huge unmapped areas inside the monitoring target should be removed
+ * to not take the time for the adaptive mechanism.
+ *
+ * For this reason, we convert the complex mappings to three distinct regions
+ * that cover every mapped area of the address space.  Also the two gaps
+ * between the three regions are the two biggest unmapped areas in the given
+ * address space.  In detail, this function first identifies the start and the
+ * end of the mappings and the two biggest unmapped areas of the address space.
+ * Then, it constructs the three regions as below:
+ *
+ *     [mappings[0]->start, big_two_unmapped_areas[0]->start)
+ *     [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start)
+ *     [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end)
+ *
+ * As usual memory map of processes is as below, the gap between the heap and
+ * the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed
+ * region and the stack will be two biggest unmapped regions.  Because these
+ * gaps are exceptionally huge areas in usual address space, excluding these
+ * two biggest unmapped regions will be sufficient to make a trade-off.
+ *
+ *   <heap>
+ *   <BIG UNMAPPED REGION 1>
+ *   <uppermost mmap()-ed region>
+ *   (other mmap()-ed regions and small unmapped regions)
+ *   <lowermost mmap()-ed region>
+ *   <BIG UNMAPPED REGION 2>
+ *   <stack>
+ *
+ * Each of the three regions is then added to 't' and split into pieces of
+ * about (total size of the three regions / ctx->attrs.min_nr_regions) bytes,
+ * but not smaller than DAMON_MIN_REGION.  The return value of the split is
+ * not checked.  If the three regions cannot be found, or a region cannot be
+ * allocated, a message is logged and the function returns early.
+ */
+static void __damon_va_init_regions(struct damon_ctx *ctx,
+				     struct damon_target *t)
+{
+	struct damon_target *ti;
+	struct damon_region *r;
+	struct damon_addr_range regions[3];
+	unsigned long sz = 0, nr_pieces;
+	int i, tidx = 0;
+
+	if (damon_va_three_regions(t, regions)) {
+		damon_for_each_target(ti, ctx) {	/* index of 't', for the log */
+			if (ti == t)
+				break;
+			tidx++;
+		}
+		pr_debug("Failed to get three regions of %dth target\n", tidx);
+		return;
+	}
+
+	for (i = 0; i < 3; i++)
+		sz += regions[i].end - regions[i].start;
+	if (ctx->attrs.min_nr_regions)
+		sz /= ctx->attrs.min_nr_regions;
+	if (sz < DAMON_MIN_REGION)
+		sz = DAMON_MIN_REGION;
+
+	/* Set the initial three regions of the target */
+	for (i = 0; i < 3; i++) {
+		r = damon_new_region(regions[i].start, regions[i].end);
+		if (!r) {
+			pr_err("%d'th init region creation failed\n", i);
+			return;
+		}
+		damon_add_region(r, t);
+
+		nr_pieces = (regions[i].end - regions[i].start) / sz;
+		damon_va_evenly_split_region(t, r, nr_pieces);
+	}
+}
+
+/* Initialize '->regions_list' of every target (task) */
+static void damon_va_init(struct damon_ctx *ctx)
+{
+	struct damon_target *t;
+
+	damon_for_each_target(t, ctx) {
+		/* the user may set the target regions as they want */
+		if (!damon_nr_regions(t))
+
__damon_va_init_regions(ctx, t); + } +} + +/* + * Update regions for current memory mappings + */ +static void damon_va_update(struct damon_ctx *ctx) +{ + struct damon_addr_range three_regions[3]; + struct damon_target *t; + + damon_for_each_target(t, ctx) { + if (damon_va_three_regions(t, three_regions)) + continue; + damon_set_regions(t, three_regions, 3); + } +} + +static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t *pte; + spinlock_t *ptl; + + if (pmd_trans_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_huge(*pmd)) { + damon_pmdp_mkold(pmd, walk->vma, addr); + spin_unlock(ptl); + return 0; + } + spin_unlock(ptl); + } + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + return 0; + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!pte_present(*pte)) + goto out; + damon_ptep_mkold(pte, walk->vma, addr); +out: + pte_unmap_unlock(pte, ptl); + return 0; +} + +#ifdef CONFIG_HUGETLB_PAGE +static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr) +{ + bool referenced = false; + pte_t entry = huge_ptep_get(pte); + struct page *page = pte_page(entry); + + get_page(page); + + if (pte_young(entry)) { + referenced = true; + entry = pte_mkold(entry); + set_huge_pte_at(mm, addr, pte, entry); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + huge_page_size(hstate_vma(vma)))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hstate *h = hstate_vma(walk->vma); + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if 
(!pte_present(entry)) + goto out; + + damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_mkold_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + +static const struct mm_walk_ops damon_mkold_ops = { + .pmd_entry = damon_mkold_pmd_entry, + .hugetlb_entry = damon_mkold_hugetlb_entry, +}; + +static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) +{ + mmap_read_lock(mm); + walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL); + mmap_read_unlock(mm); +} + +/* + * Functions for the access checking of the regions + */ + +static void __damon_va_prepare_access_check(struct mm_struct *mm, + struct damon_region *r) +{ + r->sampling_addr = damon_rand(r->ar.start, r->ar.end); + + damon_va_mkold(mm, r->sampling_addr); +} + +static void damon_va_prepare_access_checks(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + damon_for_each_region(r, t) + __damon_va_prepare_access_check(mm, r); + mmput(mm); + } +} + +struct damon_young_walk_private { + unsigned long *page_sz; + bool young; +}; + +static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t *pte; + spinlock_t *ptl; + struct page *page; + struct damon_young_walk_private *priv = walk->private; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + + if (!pmd_trans_huge(*pmd)) { + spin_unlock(ptl); + goto regular_page; + } + page = damon_get_page(pmd_pfn(*pmd)); + if (!page) + goto huge_out; + if (pmd_young(*pmd) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, + addr)) { + *priv->page_sz = HPAGE_PMD_SIZE; + priv->young = true; + } + put_page(page); +huge_out: + spin_unlock(ptl); + return 0; + } + +regular_page: +#endif 
/* CONFIG_TRANSPARENT_HUGEPAGE */
+
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		return -EINVAL;
+	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	if (!pte_present(*pte))
+		goto out;
+	page = damon_get_page(pte_pfn(*pte));
+	if (!page)
+		goto out;
+	if (pte_young(*pte) || !page_is_idle(page) ||
+			mmu_notifier_test_young(walk->mm, addr)) {
+		*priv->page_sz = PAGE_SIZE;	/* report the size of the checked page */
+		priv->young = true;
+	}
+	put_page(page);
+out:
+	pte_unmap_unlock(pte, ptl);
+	return 0;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
+				     unsigned long addr, unsigned long end,
+				     struct mm_walk *walk)
+{
+	struct damon_young_walk_private *priv = walk->private;
+	struct hstate *h = hstate_vma(walk->vma);
+	struct page *page;
+	spinlock_t *ptl;
+	pte_t entry;
+
+	ptl = huge_pte_lock(h, walk->mm, pte);
+	entry = huge_ptep_get(pte);
+	if (!pte_present(entry))
+		goto out;
+
+	page = pte_page(entry);
+	get_page(page);
+
+	if (pte_young(entry) || !page_is_idle(page) ||
+	    mmu_notifier_test_young(walk->mm, addr)) {
+		*priv->page_sz = huge_page_size(h);	/* size of the huge page */
+		priv->young = true;
+	}
+
+	put_page(page);
+
+out:
+	spin_unlock(ptl);
+	return 0;
+}
+#else
+#define damon_young_hugetlb_entry NULL
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static const struct mm_walk_ops damon_young_ops = {
+	.pmd_entry = damon_young_pmd_entry,
+	.hugetlb_entry = damon_young_hugetlb_entry,
+};
+
+static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
+		unsigned long *page_sz)
+{
+	struct damon_young_walk_private arg = {
+		.page_sz = page_sz,	/* written only when the page is young */
+		.young = false,
+	};
+
+	mmap_read_lock(mm);
+	walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg);	/* return value unused */
+	mmap_read_unlock(mm);
+	return arg.young;
+}
+
+/*
+ * Check whether the region was accessed after the last preparation
+ *
+ * mm	'mm_struct' for the given virtual address space
+ * r	the region to be checked
+ * same_target	whether 'r' belongs to the same target as the region that was
+ *		checked by the previous call; only then may the result of the
+ *		previous call be reused for 'r'
+ *
+ * Increases 'r->nr_accesses' if the page of 'r->sampling_addr' is found to
+ * have been accessed.
+ */
+static void __damon_va_check_access(struct mm_struct *mm,
+				struct damon_region *r, bool
same_target) +{ + static unsigned long last_addr; + static unsigned long last_page_sz = PAGE_SIZE; + static bool last_accessed; + + /* If the region is in the last checked page, reuse the result */ + if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) == + ALIGN_DOWN(r->sampling_addr, last_page_sz))) { + if (last_accessed) + r->nr_accesses++; + return; + } + + last_accessed = damon_va_young(mm, r->sampling_addr, &last_page_sz); + if (last_accessed) + r->nr_accesses++; + + last_addr = r->sampling_addr; +} + +static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +{ + struct damon_target *t; + struct mm_struct *mm; + struct damon_region *r; + unsigned int max_nr_accesses = 0; + bool same_target; + + damon_for_each_target(t, ctx) { + mm = damon_get_mm(t); + if (!mm) + continue; + same_target = false; + damon_for_each_region(r, t) { + __damon_va_check_access(mm, r, same_target); + max_nr_accesses = max(r->nr_accesses, max_nr_accesses); + same_target = true; + } + mmput(mm); + } + + return max_nr_accesses; +} + +/* + * Functions for the target validity check and cleanup + */ + +static bool damon_va_target_valid(struct damon_target *t) +{ + struct task_struct *task; + + task = damon_get_task_struct(t); + if (task) { + put_task_struct(task); + return true; + } + + return false; +} + +#ifndef CONFIG_ADVISE_SYSCALLS +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) +{ + return 0; +} +#else +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) +{ + struct mm_struct *mm; + unsigned long start = PAGE_ALIGN(r->ar.start); + unsigned long len = PAGE_ALIGN(damon_sz_region(r)); + unsigned long applied; + + mm = damon_get_mm(target); + if (!mm) + return 0; + + applied = do_madvise(mm, start, len, behavior) ? 
0 : len; + mmput(mm); + + return applied; +} +#endif /* CONFIG_ADVISE_SYSCALLS */ + +static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + int madv_action; + + switch (scheme->action) { + case DAMOS_WILLNEED: + madv_action = MADV_WILLNEED; + break; + case DAMOS_COLD: + madv_action = MADV_COLD; + break; + case DAMOS_PAGEOUT: + madv_action = MADV_PAGEOUT; + break; + case DAMOS_HUGEPAGE: + madv_action = MADV_HUGEPAGE; + break; + case DAMOS_NOHUGEPAGE: + madv_action = MADV_NOHUGEPAGE; + break; + case DAMOS_STAT: + return 0; + default: + /* + * DAMOS actions that are not yet supported by 'vaddr'. + */ + return 0; + } + + return damos_madvise(t, r, madv_action); +} + +static int damon_va_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_cold_score(context, r, scheme); + default: + break; + } + + return DAMOS_MAX_SCORE; +} + +static int __init damon_va_initcall(void) +{ + struct damon_operations ops = { + .id = DAMON_OPS_VADDR, + .init = damon_va_init, + .update = damon_va_update, + .prepare_access_checks = damon_va_prepare_access_checks, + .check_accesses = damon_va_check_accesses, + .reset_aggregated = NULL, + .target_valid = damon_va_target_valid, + .cleanup = NULL, + .apply_scheme = damon_va_apply_scheme, + .get_scheme_score = damon_va_scheme_score, + }; + /* ops for fixed virtual address ranges */ + struct damon_operations ops_fvaddr = ops; + int err; + + /* Don't set the monitoring target regions for the entire mapping */ + ops_fvaddr.id = DAMON_OPS_FVADDR; + ops_fvaddr.init = NULL; + ops_fvaddr.update = NULL; + + err = damon_register_ops(&ops); + if (err) + return err; + return damon_register_ops(&ops_fvaddr); +}; + +subsys_initcall(damon_va_initcall); + +#include "vaddr-test.h" diff --git a/mm/debug.c b/mm/debug.c new file mode 100644 index 
000000000..0fd15ba70 --- /dev/null +++ b/mm/debug.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/debug.c + * + * mm/ specific debug routines. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include + +/* + * Define EM() and EMe() so that MIGRATE_REASON from trace/events/migrate.h can + * be used to populate migrate_reason_names[]. + */ +#undef EM +#undef EMe +#define EM(a, b) b, +#define EMe(a, b) b + +const char *migrate_reason_names[MR_TYPES] = { + MIGRATE_REASON +}; + +const struct trace_print_flags pageflag_names[] = { + __def_pageflag_names, + {0, NULL} +}; + +const struct trace_print_flags gfpflag_names[] = { + __def_gfpflag_names, + {0, NULL} +}; + +const struct trace_print_flags vmaflag_names[] = { + __def_vmaflag_names, + {0, NULL} +}; + +static void __dump_page(struct page *page) +{ + struct folio *folio = page_folio(page); + struct page *head = &folio->page; + struct address_space *mapping; + bool compound = PageCompound(page); + /* + * Accessing the pageblock without the zone lock. It could change to + * "isolate" again in the meantime, but since we are just dumping the + * state for debugging, it should be fine to accept a bit of + * inaccuracy here due to racing. + */ + bool page_cma = is_migrate_cma_page(page); + int mapcount; + char *type = ""; + + if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) { + /* + * Corrupt page, so we cannot call page_mapping. Instead, do a + * safe subset of the steps that page_mapping() does. Caution: + * this will be misleading for tail pages, PageSwapCache pages, + * and potentially other situations. (See the page_mapping() + * implementation for what's missing here.) 
+ */ + unsigned long tmp = (unsigned long)page->mapping; + + if (tmp & PAGE_MAPPING_ANON) + mapping = NULL; + else + mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS); + head = page; + folio = (struct folio *)page; + compound = false; + } else { + mapping = page_mapping(page); + } + + /* + * Avoid VM_BUG_ON() in page_mapcount(). + * page->_mapcount space in struct page is used by sl[aou]b pages to + * encode own info. + */ + mapcount = PageSlab(head) ? 0 : page_mapcount(page); + + pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", + page, page_ref_count(head), mapcount, mapping, + page_to_pgoff(page), page_to_pfn(page)); + if (compound) { + pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", + head, compound_order(head), + folio_entire_mapcount(folio), + head_compound_pincount(head)); + } + +#ifdef CONFIG_MEMCG + if (head->memcg_data) + pr_warn("memcg:%lx\n", head->memcg_data); +#endif + if (PageKsm(page)) + type = "ksm "; + else if (PageAnon(page)) + type = "anon "; + else if (mapping) + dump_mapping(mapping); + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); + + pr_warn("%sflags: %pGp%s\n", type, &head->flags, + page_cma ? 
" CMA" : ""); + print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, + sizeof(unsigned long), page, + sizeof(struct page), false); + if (head != page) + print_hex_dump(KERN_WARNING, "head: ", DUMP_PREFIX_NONE, 32, + sizeof(unsigned long), head, + sizeof(struct page), false); +} + +void dump_page(struct page *page, const char *reason) +{ + if (PagePoisoned(page)) + pr_warn("page:%p is uninitialized and poisoned", page); + else + __dump_page(page); + if (reason) + pr_warn("page dumped because: %s\n", reason); + dump_page_owner(page); +} +EXPORT_SYMBOL(dump_page); + +#ifdef CONFIG_DEBUG_VM + +void dump_vma(const struct vm_area_struct *vma) +{ + pr_emerg("vma %px start %px end %px mm %px\n" + "prot %lx anon_vma %px vm_ops %px\n" + "pgoff %lx file %px private_data %px\n" + "flags: %#lx(%pGv)\n", + vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm, + (unsigned long)pgprot_val(vma->vm_page_prot), + vma->anon_vma, vma->vm_ops, vma->vm_pgoff, + vma->vm_file, vma->vm_private_data, + vma->vm_flags, &vma->vm_flags); +} +EXPORT_SYMBOL(dump_vma); + +void dump_mm(const struct mm_struct *mm) +{ + pr_emerg("mm %px task_size %lu\n" +#ifdef CONFIG_MMU + "get_unmapped_area %px\n" +#endif + "mmap_base %lu mmap_legacy_base %lu\n" + "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" + "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" + "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n" + "start_code %lx end_code %lx start_data %lx end_data %lx\n" + "start_brk %lx brk %lx start_stack %lx\n" + "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" + "binfmt %px flags %lx\n" +#ifdef CONFIG_AIO + "ioctx_table %px\n" +#endif +#ifdef CONFIG_MEMCG + "owner %px " +#endif + "exe_file %px\n" +#ifdef CONFIG_MMU_NOTIFIER + "notifier_subscriptions %px\n" +#endif +#ifdef CONFIG_NUMA_BALANCING + "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n" +#endif + "tlb_flush_pending %d\n" + "def_flags: %#lx(%pGv)\n", + + mm, mm->task_size, +#ifdef 
CONFIG_MMU + mm->get_unmapped_area, +#endif + mm->mmap_base, mm->mmap_legacy_base, + mm->pgd, atomic_read(&mm->mm_users), + atomic_read(&mm->mm_count), + mm_pgtables_bytes(mm), + mm->map_count, + mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, + (u64)atomic64_read(&mm->pinned_vm), + mm->data_vm, mm->exec_vm, mm->stack_vm, + mm->start_code, mm->end_code, mm->start_data, mm->end_data, + mm->start_brk, mm->brk, mm->start_stack, + mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, + mm->binfmt, mm->flags, +#ifdef CONFIG_AIO + mm->ioctx_table, +#endif +#ifdef CONFIG_MEMCG + mm->owner, +#endif + mm->exe_file, +#ifdef CONFIG_MMU_NOTIFIER + mm->notifier_subscriptions, +#endif +#ifdef CONFIG_NUMA_BALANCING + mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq, +#endif + atomic_read(&mm->tlb_flush_pending), + mm->def_flags, &mm->def_flags + ); +} + +static bool page_init_poisoning __read_mostly = true; + +static int __init setup_vm_debug(char *str) +{ + bool __page_init_poisoning = true; + + /* + * Calling vm_debug with no arguments is equivalent to requesting + * to enable all debugging options we can control. + */ + if (*str++ != '=' || !*str) + goto out; + + __page_init_poisoning = false; + if (*str == '-') + goto out; + + while (*str) { + switch (tolower(*str)) { + case'p': + __page_init_poisoning = true; + break; + default: + pr_err("vm_debug option '%c' unknown. 
skipped\n", + *str); + } + + str++; + } +out: + if (page_init_poisoning && !__page_init_poisoning) + pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n"); + + page_init_poisoning = __page_init_poisoning; + + return 1; +} +__setup("vm_debug", setup_vm_debug); + +void page_init_poison(struct page *page, size_t size) +{ + if (page_init_poisoning) + memset(page, PAGE_POISON_PATTERN, size); +} +#endif /* CONFIG_DEBUG_VM */ diff --git a/mm/debug_page_ref.c b/mm/debug_page_ref.c new file mode 100644 index 000000000..f3b2c9d3e --- /dev/null +++ b/mm/debug_page_ref.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#define CREATE_TRACE_POINTS +#include + +void __page_ref_set(struct page *page, int v) +{ + trace_page_ref_set(page, v); +} +EXPORT_SYMBOL(__page_ref_set); +EXPORT_TRACEPOINT_SYMBOL(page_ref_set); + +void __page_ref_mod(struct page *page, int v) +{ + trace_page_ref_mod(page, v); +} +EXPORT_SYMBOL(__page_ref_mod); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod); + +void __page_ref_mod_and_test(struct page *page, int v, int ret) +{ + trace_page_ref_mod_and_test(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_mod_and_test); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_test); + +void __page_ref_mod_and_return(struct page *page, int v, int ret) +{ + trace_page_ref_mod_and_return(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_mod_and_return); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_return); + +void __page_ref_mod_unless(struct page *page, int v, int u) +{ + trace_page_ref_mod_unless(page, v, u); +} +EXPORT_SYMBOL(__page_ref_mod_unless); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_unless); + +void __page_ref_freeze(struct page *page, int v, int ret) +{ + trace_page_ref_freeze(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_freeze); +EXPORT_TRACEPOINT_SYMBOL(page_ref_freeze); + +void __page_ref_unfreeze(struct page *page, int v) +{ + trace_page_ref_unfreeze(page, v); +} +EXPORT_SYMBOL(__page_ref_unfreeze); 
+EXPORT_TRACEPOINT_SYMBOL(page_ref_unfreeze); diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c new file mode 100644 index 000000000..dc7df1254 --- /dev/null +++ b/mm/debug_vm_pgtable.c @@ -0,0 +1,1357 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This kernel test validates architecture page table helpers and + * accessors and helps in verifying their continued compliance with + * expected generic MM semantics. + * + * Copyright (C) 2019 ARM Ltd. + * + * Author: Anshuman Khandual + */ +#define pr_fmt(fmt) "debug_vm_pgtable: [%-25s]: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics + * expectations that are being validated here. All future changes in here + * or the documentation need to be in sync. + */ + +#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC) + +/* + * On s390 platform, the lower 4 bits are used to identify given page table + * entry type. But these bits might affect the ability to clear entries with + * pxx_clear() because of how dynamic page table folding works on s390. So + * while loading up the entries do not change the lower 4 bits. It does not + * have affect any other platform. Also avoid the 62nd bit on ppc64 that is + * used to mark a pte entry. 
+ */ +#define S390_SKIP_MASK GENMASK(3, 0) +#if __BITS_PER_LONG == 64 +#define PPC64_SKIP_MASK GENMASK(62, 62) +#else +#define PPC64_SKIP_MASK 0x0 +#endif +#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK) +#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK) +#define RANDOM_NZVALUE GENMASK(7, 0) + +struct pgtable_debug_args { + struct mm_struct *mm; + struct vm_area_struct *vma; + + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + p4d_t *start_p4dp; + pud_t *start_pudp; + pmd_t *start_pmdp; + pgtable_t start_ptep; + + unsigned long vaddr; + pgprot_t page_prot; + pgprot_t page_prot_none; + + bool is_contiguous_page; + unsigned long pud_pfn; + unsigned long pmd_pfn; + unsigned long pte_pfn; + + unsigned long fixed_pgd_pfn; + unsigned long fixed_p4d_pfn; + unsigned long fixed_pud_pfn; + unsigned long fixed_pmd_pfn; + unsigned long fixed_pte_pfn; +}; + +static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx) +{ + pgprot_t prot = vm_get_page_prot(idx); + pte_t pte = pfn_pte(args->fixed_pte_pfn, prot); + unsigned long val = idx, *ptr = &val; + + pr_debug("Validating PTE basic (%pGv)\n", ptr); + + /* + * This test needs to be executed after the given page table entry + * is created with pfn_pte() to make sure that vm_get_page_prot(idx) + * does not have the dirty bit enabled from the beginning. This is + * important for platforms like arm64 where (!PTE_RDONLY) indicate + * dirty bit being set. 
+ */ + WARN_ON(pte_dirty(pte_wrprotect(pte))); + + WARN_ON(!pte_same(pte, pte)); + WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte)))); + WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte)))); + WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte)))); + WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte)))); + WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte)))); + WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte)))); + WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte)))); + WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte)))); +} + +static void __init pte_advanced_tests(struct pgtable_debug_args *args) +{ + struct page *page; + pte_t pte; + + /* + * Architectures optimize set_pte_at by avoiding TLB flush. + * This requires set_pte_at to be not used to update an + * existing pte entry. Clear pte before we do set_pte_at + * + * flush_dcache_page() is called after set_pte_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. + */ + page = (args->pte_pfn != ULONG_MAX) ? 
pfn_to_page(args->pte_pfn) : NULL; + if (!page) + return; + + pr_debug("Validating PTE advanced\n"); + pte = pfn_pte(args->pte_pfn, args->page_prot); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); + ptep_set_wrprotect(args->mm, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); + WARN_ON(pte_write(pte)); + ptep_get_and_clear(args->mm, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); + WARN_ON(!pte_none(pte)); + + pte = pfn_pte(args->pte_pfn, args->page_prot); + pte = pte_wrprotect(pte); + pte = pte_mkclean(pte); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); + pte = pte_mkwrite(pte); + pte = pte_mkdirty(pte); + ptep_set_access_flags(args->vma, args->vaddr, args->ptep, pte, 1); + pte = ptep_get(args->ptep); + WARN_ON(!(pte_write(pte) && pte_dirty(pte))); + ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1); + pte = ptep_get(args->ptep); + WARN_ON(!pte_none(pte)); + + pte = pfn_pte(args->pte_pfn, args->page_prot); + pte = pte_mkyoung(pte); + set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); + ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); + WARN_ON(pte_young(pte)); + + ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1); +} + +static void __init pte_savedwrite_tests(struct pgtable_debug_args *args) +{ + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none); + + if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) + return; + + pr_debug("Validating PTE saved write\n"); + WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte)))); + WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte)))); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) +{ + pgprot_t prot = vm_get_page_prot(idx); + unsigned long val = idx, *ptr = &val; + pmd_t pmd; + + if (!has_transparent_hugepage()) + return; + + pr_debug("Validating PMD 
basic (%pGv)\n", ptr); + pmd = pfn_pmd(args->fixed_pmd_pfn, prot); + + /* + * This test needs to be executed after the given page table entry + * is created with pfn_pmd() to make sure that vm_get_page_prot(idx) + * does not have the dirty bit enabled from the beginning. This is + * important for platforms like arm64 where (!PTE_RDONLY) indicate + * dirty bit being set. + */ + WARN_ON(pmd_dirty(pmd_wrprotect(pmd))); + + + WARN_ON(!pmd_same(pmd, pmd)); + WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd)))); + WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd)))); + WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd)))); + WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd)))); + WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd)))); + WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd)))); + WARN_ON(pmd_dirty(pmd_wrprotect(pmd_mkclean(pmd)))); + WARN_ON(!pmd_dirty(pmd_wrprotect(pmd_mkdirty(pmd)))); + /* + * A huge page does not point to next level page table + * entry. Hence this must qualify as pmd_bad(). + */ + WARN_ON(!pmd_bad(pmd_mkhuge(pmd))); +} + +static void __init pmd_advanced_tests(struct pgtable_debug_args *args) +{ + struct page *page; + pmd_t pmd; + unsigned long vaddr = args->vaddr; + + if (!has_transparent_hugepage()) + return; + + page = (args->pmd_pfn != ULONG_MAX) ? pfn_to_page(args->pmd_pfn) : NULL; + if (!page) + return; + + /* + * flush_dcache_page() is called after set_pmd_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. 
+ */ + pr_debug("Validating PMD advanced\n"); + /* Align the address wrt HPAGE_PMD_SIZE */ + vaddr &= HPAGE_PMD_MASK; + + pgtable_trans_huge_deposit(args->mm, args->pmdp, args->start_ptep); + + pmd = pfn_pmd(args->pmd_pfn, args->page_prot); + set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + flush_dcache_page(page); + pmdp_set_wrprotect(args->mm, vaddr, args->pmdp); + pmd = READ_ONCE(*args->pmdp); + WARN_ON(pmd_write(pmd)); + pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp); + pmd = READ_ONCE(*args->pmdp); + WARN_ON(!pmd_none(pmd)); + + pmd = pfn_pmd(args->pmd_pfn, args->page_prot); + pmd = pmd_wrprotect(pmd); + pmd = pmd_mkclean(pmd); + set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + flush_dcache_page(page); + pmd = pmd_mkwrite(pmd); + pmd = pmd_mkdirty(pmd); + pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1); + pmd = READ_ONCE(*args->pmdp); + WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd))); + pmdp_huge_get_and_clear_full(args->vma, vaddr, args->pmdp, 1); + pmd = READ_ONCE(*args->pmdp); + WARN_ON(!pmd_none(pmd)); + + pmd = pmd_mkhuge(pfn_pmd(args->pmd_pfn, args->page_prot)); + pmd = pmd_mkyoung(pmd); + set_pmd_at(args->mm, vaddr, args->pmdp, pmd); + flush_dcache_page(page); + pmdp_test_and_clear_young(args->vma, vaddr, args->pmdp); + pmd = READ_ONCE(*args->pmdp); + WARN_ON(pmd_young(pmd)); + + /* Clear the pte entries */ + pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp); + pgtable_trans_huge_withdraw(args->mm, args->pmdp); +} + +static void __init pmd_leaf_tests(struct pgtable_debug_args *args) +{ + pmd_t pmd; + + if (!has_transparent_hugepage()) + return; + + pr_debug("Validating PMD leaf\n"); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); + + /* + * PMD based THP is a leaf entry. 
+ */ + pmd = pmd_mkhuge(pmd); + WARN_ON(!pmd_leaf(pmd)); +} + +static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) +{ + pmd_t pmd; + + if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) + return; + + if (!has_transparent_hugepage()) + return; + + pr_debug("Validating PMD saved write\n"); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none); + WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd)))); + WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd)))); +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) +{ + pgprot_t prot = vm_get_page_prot(idx); + unsigned long val = idx, *ptr = &val; + pud_t pud; + + if (!has_transparent_hugepage()) + return; + + pr_debug("Validating PUD basic (%pGv)\n", ptr); + pud = pfn_pud(args->fixed_pud_pfn, prot); + + /* + * This test needs to be executed after the given page table entry + * is created with pfn_pud() to make sure that vm_get_page_prot(idx) + * does not have the dirty bit enabled from the beginning. This is + * important for platforms like arm64 where (!PTE_RDONLY) indicate + * dirty bit being set. + */ + WARN_ON(pud_dirty(pud_wrprotect(pud))); + + WARN_ON(!pud_same(pud, pud)); + WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud)))); + WARN_ON(!pud_dirty(pud_mkdirty(pud_mkclean(pud)))); + WARN_ON(pud_dirty(pud_mkclean(pud_mkdirty(pud)))); + WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud)))); + WARN_ON(pud_write(pud_wrprotect(pud_mkwrite(pud)))); + WARN_ON(pud_young(pud_mkold(pud_mkyoung(pud)))); + WARN_ON(pud_dirty(pud_wrprotect(pud_mkclean(pud)))); + WARN_ON(!pud_dirty(pud_wrprotect(pud_mkdirty(pud)))); + + if (mm_pmd_folded(args->mm)) + return; + + /* + * A huge page does not point to next level page table + * entry. Hence this must qualify as pud_bad(). 
+ */ + WARN_ON(!pud_bad(pud_mkhuge(pud))); +} + +static void __init pud_advanced_tests(struct pgtable_debug_args *args) +{ + struct page *page; + unsigned long vaddr = args->vaddr; + pud_t pud; + + if (!has_transparent_hugepage()) + return; + + page = (args->pud_pfn != ULONG_MAX) ? pfn_to_page(args->pud_pfn) : NULL; + if (!page) + return; + + /* + * flush_dcache_page() is called after set_pud_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. + */ + pr_debug("Validating PUD advanced\n"); + /* Align the address wrt HPAGE_PUD_SIZE */ + vaddr &= HPAGE_PUD_MASK; + + pud = pfn_pud(args->pud_pfn, args->page_prot); + set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); + pudp_set_wrprotect(args->mm, vaddr, args->pudp); + pud = READ_ONCE(*args->pudp); + WARN_ON(pud_write(pud)); + +#ifndef __PAGETABLE_PMD_FOLDED + pudp_huge_get_and_clear(args->mm, vaddr, args->pudp); + pud = READ_ONCE(*args->pudp); + WARN_ON(!pud_none(pud)); +#endif /* __PAGETABLE_PMD_FOLDED */ + pud = pfn_pud(args->pud_pfn, args->page_prot); + pud = pud_wrprotect(pud); + pud = pud_mkclean(pud); + set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); + pud = pud_mkwrite(pud); + pud = pud_mkdirty(pud); + pudp_set_access_flags(args->vma, vaddr, args->pudp, pud, 1); + pud = READ_ONCE(*args->pudp); + WARN_ON(!(pud_write(pud) && pud_dirty(pud))); + +#ifndef __PAGETABLE_PMD_FOLDED + pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1); + pud = READ_ONCE(*args->pudp); + WARN_ON(!pud_none(pud)); +#endif /* __PAGETABLE_PMD_FOLDED */ + + pud = pfn_pud(args->pud_pfn, args->page_prot); + pud = pud_mkyoung(pud); + set_pud_at(args->mm, vaddr, args->pudp, pud); + flush_dcache_page(page); + pudp_test_and_clear_young(args->vma, vaddr, args->pudp); + pud = 
READ_ONCE(*args->pudp); + WARN_ON(pud_young(pud)); + + pudp_huge_get_and_clear(args->mm, vaddr, args->pudp); +} + +static void __init pud_leaf_tests(struct pgtable_debug_args *args) +{ + pud_t pud; + + if (!has_transparent_hugepage()) + return; + + pr_debug("Validating PUD leaf\n"); + pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); + /* + * PUD based THP is a leaf entry. + */ + pud = pud_mkhuge(pud); + WARN_ON(!pud_leaf(pud)); +} +#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } +static void __init pud_advanced_tests(struct pgtable_debug_args *args) { } +static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { } +static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { } +static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { } +static void __init pud_advanced_tests(struct pgtable_debug_args *args) { } +static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { } +static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } +static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +static void __init pmd_huge_tests(struct pgtable_debug_args *args) +{ + pmd_t pmd; + + if (!arch_vmap_pmd_supported(args->page_prot)) + return; + + pr_debug("Validating PMD huge\n"); + /* + * X86 defined pmd_set_huge() verifies that the given + * PMD is not a populated non-leaf entry. 
+ */ + WRITE_ONCE(*args->pmdp, __pmd(0)); + WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn), args->page_prot)); + WARN_ON(!pmd_clear_huge(args->pmdp)); + pmd = READ_ONCE(*args->pmdp); + WARN_ON(!pmd_none(pmd)); +} + +static void __init pud_huge_tests(struct pgtable_debug_args *args) +{ + pud_t pud; + + if (!arch_vmap_pud_supported(args->page_prot)) + return; + + pr_debug("Validating PUD huge\n"); + /* + * X86 defined pud_set_huge() verifies that the given + * PUD is not a populated non-leaf entry. + */ + WRITE_ONCE(*args->pudp, __pud(0)); + WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn), args->page_prot)); + WARN_ON(!pud_clear_huge(args->pudp)); + pud = READ_ONCE(*args->pudp); + WARN_ON(!pud_none(pud)); +} +#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ +static void __init pmd_huge_tests(struct pgtable_debug_args *args) { } +static void __init pud_huge_tests(struct pgtable_debug_args *args) { } +#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ + +static void __init p4d_basic_tests(struct pgtable_debug_args *args) +{ + p4d_t p4d; + + pr_debug("Validating P4D basic\n"); + memset(&p4d, RANDOM_NZVALUE, sizeof(p4d_t)); + WARN_ON(!p4d_same(p4d, p4d)); +} + +static void __init pgd_basic_tests(struct pgtable_debug_args *args) +{ + pgd_t pgd; + + pr_debug("Validating PGD basic\n"); + memset(&pgd, RANDOM_NZVALUE, sizeof(pgd_t)); + WARN_ON(!pgd_same(pgd, pgd)); +} + +#ifndef __PAGETABLE_PUD_FOLDED +static void __init pud_clear_tests(struct pgtable_debug_args *args) +{ + pud_t pud = READ_ONCE(*args->pudp); + + if (mm_pmd_folded(args->mm)) + return; + + pr_debug("Validating PUD clear\n"); + pud = __pud(pud_val(pud) | RANDOM_ORVALUE); + WRITE_ONCE(*args->pudp, pud); + pud_clear(args->pudp); + pud = READ_ONCE(*args->pudp); + WARN_ON(!pud_none(pud)); +} + +static void __init pud_populate_tests(struct pgtable_debug_args *args) +{ + pud_t pud; + + if (mm_pmd_folded(args->mm)) + return; + + pr_debug("Validating PUD populate\n"); + /* + * This entry points 
to next level page table page. + * Hence this must not qualify as pud_bad(). + */ + pud_populate(args->mm, args->pudp, args->start_pmdp); + pud = READ_ONCE(*args->pudp); + WARN_ON(pud_bad(pud)); +} +#else /* __PAGETABLE_PUD_FOLDED */ +static void __init pud_clear_tests(struct pgtable_debug_args *args) { } +static void __init pud_populate_tests(struct pgtable_debug_args *args) { } +#endif /* __PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_P4D_FOLDED +static void __init p4d_clear_tests(struct pgtable_debug_args *args) +{ + p4d_t p4d = READ_ONCE(*args->p4dp); + + if (mm_pud_folded(args->mm)) + return; + + pr_debug("Validating P4D clear\n"); + p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE); + WRITE_ONCE(*args->p4dp, p4d); + p4d_clear(args->p4dp); + p4d = READ_ONCE(*args->p4dp); + WARN_ON(!p4d_none(p4d)); +} + +static void __init p4d_populate_tests(struct pgtable_debug_args *args) +{ + p4d_t p4d; + + if (mm_pud_folded(args->mm)) + return; + + pr_debug("Validating P4D populate\n"); + /* + * This entry points to next level page table page. + * Hence this must not qualify as p4d_bad(). + */ + pud_clear(args->pudp); + p4d_clear(args->p4dp); + p4d_populate(args->mm, args->p4dp, args->start_pudp); + p4d = READ_ONCE(*args->p4dp); + WARN_ON(p4d_bad(p4d)); +} + +static void __init pgd_clear_tests(struct pgtable_debug_args *args) +{ + pgd_t pgd = READ_ONCE(*(args->pgdp)); + + if (mm_p4d_folded(args->mm)) + return; + + pr_debug("Validating PGD clear\n"); + pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE); + WRITE_ONCE(*args->pgdp, pgd); + pgd_clear(args->pgdp); + pgd = READ_ONCE(*args->pgdp); + WARN_ON(!pgd_none(pgd)); +} + +static void __init pgd_populate_tests(struct pgtable_debug_args *args) +{ + pgd_t pgd; + + if (mm_p4d_folded(args->mm)) + return; + + pr_debug("Validating PGD populate\n"); + /* + * This entry points to next level page table page. + * Hence this must not qualify as pgd_bad().
+ */ + p4d_clear(args->p4dp); + pgd_clear(args->pgdp); + pgd_populate(args->mm, args->pgdp, args->start_p4dp); + pgd = READ_ONCE(*args->pgdp); + WARN_ON(pgd_bad(pgd)); +} +#else /* __PAGETABLE_P4D_FOLDED */ +static void __init p4d_clear_tests(struct pgtable_debug_args *args) { } +static void __init pgd_clear_tests(struct pgtable_debug_args *args) { } +static void __init p4d_populate_tests(struct pgtable_debug_args *args) { } +static void __init pgd_populate_tests(struct pgtable_debug_args *args) { } +#endif /* __PAGETABLE_P4D_FOLDED */ + +static void __init pte_clear_tests(struct pgtable_debug_args *args) +{ + struct page *page; + pte_t pte = pfn_pte(args->pte_pfn, args->page_prot); + + page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL; + if (!page) + return; + + /* + * flush_dcache_page() is called after set_pte_at() to clear + * PG_arch_1 for the page on ARM64. The page flag isn't cleared + * when it's released and page allocation check will fail when + * the page is allocated again. For architectures other than ARM64, + * the unexpected overhead of cache flushing is acceptable. + */ + pr_debug("Validating PTE clear\n"); +#ifndef CONFIG_RISCV + pte = __pte(pte_val(pte) | RANDOM_ORVALUE); +#endif + set_pte_at(args->mm, args->vaddr, args->ptep, pte); + flush_dcache_page(page); + barrier(); + ptep_clear(args->mm, args->vaddr, args->ptep); + pte = ptep_get(args->ptep); + WARN_ON(!pte_none(pte)); +} + +static void __init pmd_clear_tests(struct pgtable_debug_args *args) +{ + pmd_t pmd = READ_ONCE(*args->pmdp); + + pr_debug("Validating PMD clear\n"); + pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE); + WRITE_ONCE(*args->pmdp, pmd); + pmd_clear(args->pmdp); + pmd = READ_ONCE(*args->pmdp); + WARN_ON(!pmd_none(pmd)); +} + +static void __init pmd_populate_tests(struct pgtable_debug_args *args) +{ + pmd_t pmd; + + pr_debug("Validating PMD populate\n"); + /* + * This entry points to next level page table page. + * Hence this must not qualify as pmd_bad().
+ */ + pmd_populate(args->mm, args->pmdp, args->start_ptep); + pmd = READ_ONCE(*args->pmdp); + WARN_ON(pmd_bad(pmd)); +} + +static void __init pte_special_tests(struct pgtable_debug_args *args) +{ + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); + + if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) + return; + + pr_debug("Validating PTE special\n"); + WARN_ON(!pte_special(pte_mkspecial(pte))); +} + +static void __init pte_protnone_tests(struct pgtable_debug_args *args) +{ + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none); + + if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) + return; + + pr_debug("Validating PTE protnone\n"); + WARN_ON(!pte_protnone(pte)); + WARN_ON(!pte_present(pte)); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void __init pmd_protnone_tests(struct pgtable_debug_args *args) +{ + pmd_t pmd; + + if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) + return; + + if (!has_transparent_hugepage()) + return; + + pr_debug("Validating PMD protnone\n"); + pmd = pmd_mkhuge(pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none)); + WARN_ON(!pmd_protnone(pmd)); + WARN_ON(!pmd_present(pmd)); +} +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static void __init pmd_protnone_tests(struct pgtable_debug_args *args) { } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP +static void __init pte_devmap_tests(struct pgtable_debug_args *args) +{ + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); + + pr_debug("Validating PTE devmap\n"); + WARN_ON(!pte_devmap(pte_mkdevmap(pte))); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void __init pmd_devmap_tests(struct pgtable_debug_args *args) +{ + pmd_t pmd; + + if (!has_transparent_hugepage()) + return; + + pr_debug("Validating PMD devmap\n"); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); + WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd))); +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static void __init pud_devmap_tests(struct pgtable_debug_args *args) +{ + pud_t pud; + + if 
(!has_transparent_hugepage()) + return; + + pr_debug("Validating PUD devmap\n"); + pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); + WARN_ON(!pud_devmap(pud_mkdevmap(pud))); +} +#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { } +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#else /* !CONFIG_ARCH_HAS_PTE_DEVMAP */ +static void __init pte_devmap_tests(struct pgtable_debug_args *args) { } +static void __init pmd_devmap_tests(struct pgtable_debug_args *args) { } +static void __init pud_devmap_tests(struct pgtable_debug_args *args) { } +#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */ + +static void __init pte_soft_dirty_tests(struct pgtable_debug_args *args) +{ + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); + + if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + return; + + pr_debug("Validating PTE soft dirty\n"); + WARN_ON(!pte_soft_dirty(pte_mksoft_dirty(pte))); + WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte))); +} + +static void __init pte_swap_soft_dirty_tests(struct pgtable_debug_args *args) +{ + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); + + if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + return; + + pr_debug("Validating PTE swap soft dirty\n"); + WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte))); + WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte))); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) +{ + pmd_t pmd; + + if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + return; + + if (!has_transparent_hugepage()) + return; + + pr_debug("Validating PMD soft dirty\n"); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); + WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd))); +
WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd))); +} + +static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) +{ + pmd_t pmd; + + if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) || + !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION)) + return; + + if (!has_transparent_hugepage()) + return; + + pr_debug("Validating PMD swap soft dirty\n"); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); + WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd))); + WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd))); +} +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { } +static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args) +{ +#ifdef __HAVE_ARCH_PTE_SWP_EXCLUSIVE + pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); + + pr_debug("Validating PTE swap exclusive\n"); + pte = pte_swp_mkexclusive(pte); + WARN_ON(!pte_swp_exclusive(pte)); + pte = pte_swp_clear_exclusive(pte); + WARN_ON(pte_swp_exclusive(pte)); +#endif /* __HAVE_ARCH_PTE_SWP_EXCLUSIVE */ +} + +static void __init pte_swap_tests(struct pgtable_debug_args *args) +{ + swp_entry_t swp; + pte_t pte; + + pr_debug("Validating PTE swap\n"); + pte = pfn_pte(args->fixed_pte_pfn, args->page_prot); + swp = __pte_to_swp_entry(pte); + pte = __swp_entry_to_pte(swp); + WARN_ON(args->fixed_pte_pfn != pte_pfn(pte)); +} + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +static void __init pmd_swap_tests(struct pgtable_debug_args *args) +{ + swp_entry_t swp; + pmd_t pmd; + + if (!has_transparent_hugepage()) + return; + + pr_debug("Validating PMD swap\n"); + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); + swp = __pmd_to_swp_entry(pmd); + pmd = __swp_entry_to_pmd(swp); + WARN_ON(args->fixed_pmd_pfn != pmd_pfn(pmd)); +} +#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */ +static void __init 
pmd_swap_tests(struct pgtable_debug_args *args) { } +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +static void __init swap_migration_tests(struct pgtable_debug_args *args) +{ + struct page *page; + swp_entry_t swp; + + if (!IS_ENABLED(CONFIG_MIGRATION)) + return; + + /* + * swap_migration_tests() requires a dedicated page as it needs to + * be locked before creating a migration entry from it. Locking the + * page that actually maps kernel text ('start_kernel') can be really + * problematic. Let's use the allocated page explicitly for this + * purpose. + */ + page = (args->pte_pfn != ULONG_MAX) ? pfn_to_page(args->pte_pfn) : NULL; + if (!page) + return; + + pr_debug("Validating swap migration\n"); + + /* + * make_[readable|writable]_migration_entry() expects the given page to + * be locked, otherwise it stumbles upon a BUG_ON(). + */ + __SetPageLocked(page); + swp = make_writable_migration_entry(page_to_pfn(page)); + WARN_ON(!is_migration_entry(swp)); + WARN_ON(!is_writable_migration_entry(swp)); + + swp = make_readable_migration_entry(swp_offset(swp)); + WARN_ON(!is_migration_entry(swp)); + WARN_ON(is_writable_migration_entry(swp)); + + swp = make_readable_migration_entry(page_to_pfn(page)); + WARN_ON(!is_migration_entry(swp)); + WARN_ON(is_writable_migration_entry(swp)); + __ClearPageLocked(page); +} + +#ifdef CONFIG_HUGETLB_PAGE +static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) +{ + struct page *page; + pte_t pte; + + pr_debug("Validating HugeTLB basic\n"); + /* + * Accessing the page associated with the pfn is safe here, + * as it was previously derived from a real kernel symbol.
+ */ + page = pfn_to_page(args->fixed_pmd_pfn); + pte = mk_huge_pte(page, args->page_prot); + + WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte))); + WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte)))); + WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte)))); + +#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB + pte = pfn_pte(args->fixed_pmd_pfn, args->page_prot); + + WARN_ON(!pte_huge(pte_mkhuge(pte))); +#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ +} +#else /* !CONFIG_HUGETLB_PAGE */ +static void __init hugetlb_basic_tests(struct pgtable_debug_args *args) { } +#endif /* CONFIG_HUGETLB_PAGE */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void __init pmd_thp_tests(struct pgtable_debug_args *args) +{ + pmd_t pmd; + + if (!has_transparent_hugepage()) + return; + + pr_debug("Validating PMD based THP\n"); + /* + * pmd_trans_huge() and pmd_present() must return positive after + * MMU invalidation with pmd_mkinvalid(). This behavior is an + * optimization for transparent huge page. pmd_trans_huge() must + * be true if pmd_page() returns a valid THP to avoid taking the + * pmd_lock when others walk over non transhuge pmds (i.e. there + * are no THP allocated). Especially when splitting a THP and + * removing the present bit from the pmd, pmd_trans_huge() still + * needs to return true. pmd_present() should be true whenever + * pmd_trans_huge() returns true. 
+ */ + pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot); + WARN_ON(!pmd_trans_huge(pmd_mkhuge(pmd))); + +#ifndef __HAVE_ARCH_PMDP_INVALIDATE + WARN_ON(!pmd_trans_huge(pmd_mkinvalid(pmd_mkhuge(pmd)))); + WARN_ON(!pmd_present(pmd_mkinvalid(pmd_mkhuge(pmd)))); +#endif /* __HAVE_ARCH_PMDP_INVALIDATE */ +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static void __init pud_thp_tests(struct pgtable_debug_args *args) +{ + pud_t pud; + + if (!has_transparent_hugepage()) + return; + + pr_debug("Validating PUD based THP\n"); + pud = pfn_pud(args->fixed_pud_pfn, args->page_prot); + WARN_ON(!pud_trans_huge(pud_mkhuge(pud))); + + /* + * pud_mkinvalid() has been dropped for now. Enable back + * these tests when it comes back with a modified pud_present(). + * + * WARN_ON(!pud_trans_huge(pud_mkinvalid(pud_mkhuge(pud)))); + * WARN_ON(!pud_present(pud_mkinvalid(pud_mkhuge(pud)))); + */ +} +#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +static void __init pud_thp_tests(struct pgtable_debug_args *args) { } +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ +static void __init pmd_thp_tests(struct pgtable_debug_args *args) { } +static void __init pud_thp_tests(struct pgtable_debug_args *args) { } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static unsigned long __init get_random_vaddr(void) +{ + unsigned long random_vaddr, random_pages, total_user_pages; + + total_user_pages = (TASK_SIZE - FIRST_USER_ADDRESS) / PAGE_SIZE; + + random_pages = get_random_long() % total_user_pages; + random_vaddr = FIRST_USER_ADDRESS + random_pages * PAGE_SIZE; + + return random_vaddr; +} + +static void __init destroy_args(struct pgtable_debug_args *args) +{ + struct page *page = NULL; + + /* Free (huge) page */ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && + has_transparent_hugepage() && + args->pud_pfn != ULONG_MAX) { + if (args->is_contiguous_page) { + 
free_contig_range(args->pud_pfn, + (1 << (HPAGE_PUD_SHIFT - PAGE_SHIFT))); + } else { + page = pfn_to_page(args->pud_pfn); + __free_pages(page, HPAGE_PUD_SHIFT - PAGE_SHIFT); + } + + args->pud_pfn = ULONG_MAX; + args->pmd_pfn = ULONG_MAX; + args->pte_pfn = ULONG_MAX; + } + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + has_transparent_hugepage() && + args->pmd_pfn != ULONG_MAX) { + if (args->is_contiguous_page) { + free_contig_range(args->pmd_pfn, (1 << HPAGE_PMD_ORDER)); + } else { + page = pfn_to_page(args->pmd_pfn); + __free_pages(page, HPAGE_PMD_ORDER); + } + + args->pmd_pfn = ULONG_MAX; + args->pte_pfn = ULONG_MAX; + } + + if (args->pte_pfn != ULONG_MAX) { + page = pfn_to_page(args->pte_pfn); + __free_pages(page, 0); + + args->pte_pfn = ULONG_MAX; + } + + /* Free page table entries */ + if (args->start_ptep) { + pte_free(args->mm, args->start_ptep); + mm_dec_nr_ptes(args->mm); + } + + if (args->start_pmdp) { + pmd_free(args->mm, args->start_pmdp); + mm_dec_nr_pmds(args->mm); + } + + if (args->start_pudp) { + pud_free(args->mm, args->start_pudp); + mm_dec_nr_puds(args->mm); + } + + if (args->start_p4dp) + p4d_free(args->mm, args->start_p4dp); + + /* Free vma and mm struct */ + if (args->vma) + vm_area_free(args->vma); + + if (args->mm) + mmdrop(args->mm); +} + +static struct page * __init +debug_vm_pgtable_alloc_huge_page(struct pgtable_debug_args *args, int order) +{ + struct page *page = NULL; + +#ifdef CONFIG_CONTIG_ALLOC + if (order >= MAX_ORDER) { + page = alloc_contig_pages((1 << order), GFP_KERNEL, + first_online_node, NULL); + if (page) { + args->is_contiguous_page = true; + return page; + } + } +#endif + + if (order < MAX_ORDER) + page = alloc_pages(GFP_KERNEL, order); + + return page; +} + +static int __init init_args(struct pgtable_debug_args *args) +{ + struct page *page = NULL; + phys_addr_t phys; + int ret = 0; + + /* + * Initialize the debugging data. 
+ * + * vm_get_page_prot(VM_NONE) or vm_get_page_prot(VM_SHARED|VM_NONE) + * will help create page table entries with PROT_NONE permission as + * required for pxx_protnone_tests(). + */ + memset(args, 0, sizeof(*args)); + args->vaddr = get_random_vaddr(); + args->page_prot = vm_get_page_prot(VMFLAGS); + args->page_prot_none = vm_get_page_prot(VM_NONE); + args->is_contiguous_page = false; + args->pud_pfn = ULONG_MAX; + args->pmd_pfn = ULONG_MAX; + args->pte_pfn = ULONG_MAX; + args->fixed_pgd_pfn = ULONG_MAX; + args->fixed_p4d_pfn = ULONG_MAX; + args->fixed_pud_pfn = ULONG_MAX; + args->fixed_pmd_pfn = ULONG_MAX; + args->fixed_pte_pfn = ULONG_MAX; + + /* Allocate mm and vma */ + args->mm = mm_alloc(); + if (!args->mm) { + pr_err("Failed to allocate mm struct\n"); + ret = -ENOMEM; + goto error; + } + + args->vma = vm_area_alloc(args->mm); + if (!args->vma) { + pr_err("Failed to allocate vma\n"); + ret = -ENOMEM; + goto error; + } + + /* + * Allocate page table entries. They will be modified in the tests. + * Let's save the page table entries so that they can be released + * when the tests are completed.
+ */ + args->pgdp = pgd_offset(args->mm, args->vaddr); + args->p4dp = p4d_alloc(args->mm, args->pgdp, args->vaddr); + if (!args->p4dp) { + pr_err("Failed to allocate p4d entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_p4dp = p4d_offset(args->pgdp, 0UL); + WARN_ON(!args->start_p4dp); + + args->pudp = pud_alloc(args->mm, args->p4dp, args->vaddr); + if (!args->pudp) { + pr_err("Failed to allocate pud entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_pudp = pud_offset(args->p4dp, 0UL); + WARN_ON(!args->start_pudp); + + args->pmdp = pmd_alloc(args->mm, args->pudp, args->vaddr); + if (!args->pmdp) { + pr_err("Failed to allocate pmd entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_pmdp = pmd_offset(args->pudp, 0UL); + WARN_ON(!args->start_pmdp); + + if (pte_alloc(args->mm, args->pmdp)) { + pr_err("Failed to allocate pte entries\n"); + ret = -ENOMEM; + goto error; + } + args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp)); + WARN_ON(!args->start_ptep); + + /* + * PFN for mapping at PTE level is determined from a standard kernel + * text symbol. But pfns for higher page table levels are derived by + * masking lower bits of this real pfn. These derived pfns might not + * exist on the platform but that does not really matter as pfn_pxx() + * helpers will still create appropriate entries for the test. This + * helps avoid large memory block allocations to be used for mapping + * at higher page table levels in some of the tests. + */ + phys = __pa_symbol(&start_kernel); + args->fixed_pgd_pfn = __phys_to_pfn(phys & PGDIR_MASK); + args->fixed_p4d_pfn = __phys_to_pfn(phys & P4D_MASK); + args->fixed_pud_pfn = __phys_to_pfn(phys & PUD_MASK); + args->fixed_pmd_pfn = __phys_to_pfn(phys & PMD_MASK); + args->fixed_pte_pfn = __phys_to_pfn(phys & PAGE_MASK); + WARN_ON(!pfn_valid(args->fixed_pte_pfn)); + + /* + * Allocate (huge) pages because some of the tests need to access + * the data in the pages. 
The corresponding tests will be skipped + * if we fail to allocate (huge) pages. + */ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && + has_transparent_hugepage()) { + page = debug_vm_pgtable_alloc_huge_page(args, + HPAGE_PUD_SHIFT - PAGE_SHIFT); + if (page) { + args->pud_pfn = page_to_pfn(page); + args->pmd_pfn = args->pud_pfn; + args->pte_pfn = args->pud_pfn; + return 0; + } + } + + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + has_transparent_hugepage()) { + page = debug_vm_pgtable_alloc_huge_page(args, HPAGE_PMD_ORDER); + if (page) { + args->pmd_pfn = page_to_pfn(page); + args->pte_pfn = args->pmd_pfn; + return 0; + } + } + + page = alloc_pages(GFP_KERNEL, 0); + if (page) + args->pte_pfn = page_to_pfn(page); + + return 0; + +error: + destroy_args(args); + return ret; +} + +static int __init debug_vm_pgtable(void) +{ + struct pgtable_debug_args args; + spinlock_t *ptl = NULL; + int idx, ret; + + pr_info("Validating architecture page table helpers\n"); + ret = init_args(&args); + if (ret) + return ret; + + /* + * Iterate over each possible vm_flags to make sure that all + * the basic page table transformation validations just hold + * true irrespective of the starting protection value for a + * given page table entry. + * + * Protection based vm_flags combinatins are always linear + * and increasing i.e starting from VM_NONE and going upto + * (VM_SHARED | READ | WRITE | EXEC). + */ +#define VM_FLAGS_START (VM_NONE) +#define VM_FLAGS_END (VM_SHARED | VM_EXEC | VM_WRITE | VM_READ) + + for (idx = VM_FLAGS_START; idx <= VM_FLAGS_END; idx++) { + pte_basic_tests(&args, idx); + pmd_basic_tests(&args, idx); + pud_basic_tests(&args, idx); + } + + /* + * Both P4D and PGD level tests are very basic which do not + * involve creating page table entries from the protection + * value and the given pfn. Hence just keep them out from + * the above iteration for now to save some test execution + * time. 
+ */ + p4d_basic_tests(&args); + pgd_basic_tests(&args); + + pmd_leaf_tests(&args); + pud_leaf_tests(&args); + + pte_savedwrite_tests(&args); + pmd_savedwrite_tests(&args); + + pte_special_tests(&args); + pte_protnone_tests(&args); + pmd_protnone_tests(&args); + + pte_devmap_tests(&args); + pmd_devmap_tests(&args); + pud_devmap_tests(&args); + + pte_soft_dirty_tests(&args); + pmd_soft_dirty_tests(&args); + pte_swap_soft_dirty_tests(&args); + pmd_swap_soft_dirty_tests(&args); + + pte_swap_exclusive_tests(&args); + + pte_swap_tests(&args); + pmd_swap_tests(&args); + + swap_migration_tests(&args); + + pmd_thp_tests(&args); + pud_thp_tests(&args); + + hugetlb_basic_tests(&args); + + /* + * Page table modifying tests. They need to hold + * proper page table lock. + */ + + args.ptep = pte_offset_map_lock(args.mm, args.pmdp, args.vaddr, &ptl); + pte_clear_tests(&args); + pte_advanced_tests(&args); + pte_unmap_unlock(args.ptep, ptl); + + ptl = pmd_lock(args.mm, args.pmdp); + pmd_clear_tests(&args); + pmd_advanced_tests(&args); + pmd_huge_tests(&args); + pmd_populate_tests(&args); + spin_unlock(ptl); + + ptl = pud_lock(args.mm, args.pudp); + pud_clear_tests(&args); + pud_advanced_tests(&args); + pud_huge_tests(&args); + pud_populate_tests(&args); + spin_unlock(ptl); + + spin_lock(&(args.mm->page_table_lock)); + p4d_clear_tests(&args); + pgd_clear_tests(&args); + p4d_populate_tests(&args); + pgd_populate_tests(&args); + spin_unlock(&(args.mm->page_table_lock)); + + destroy_args(&args); + return 0; +} +late_initcall(debug_vm_pgtable); diff --git a/mm/dmapool.c b/mm/dmapool.c new file mode 100644 index 000000000..a7eb5d0eb --- /dev/null +++ b/mm/dmapool.c @@ -0,0 +1,529 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * DMA Pool allocator + * + * Copyright 2001 David Brownell + * Copyright 2007 Intel Corporation + * Author: Matthew Wilcox + * + * This allocator returns small blocks of a given size which are DMA-able by + * the given device. 
It uses the dma_alloc_coherent page allocator to get + * new pages, then splits them up into blocks of the required size. + * Many older drivers still have their own code to do this. + * + * The current design of this allocator is fairly simple. The pool is + * represented by the 'struct dma_pool' which keeps a doubly-linked list of + * allocated pages. Each page in the page_list is split into blocks of at + * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked + * list of free blocks within the page. Used blocks aren't tracked, but we + * keep a count of how many are currently allocated from each page. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) +#define DMAPOOL_DEBUG 1 +#endif + +struct dma_pool { /* the pool */ + struct list_head page_list; + spinlock_t lock; + size_t size; + struct device *dev; + size_t allocation; + size_t boundary; + char name[32]; + struct list_head pools; +}; + +struct dma_page { /* cacheable header for 'allocation' bytes */ + struct list_head page_list; + void *vaddr; + dma_addr_t dma; + unsigned int in_use; + unsigned int offset; +}; + +static DEFINE_MUTEX(pools_lock); +static DEFINE_MUTEX(pools_reg_lock); + +static ssize_t pools_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + unsigned temp; + unsigned size; + char *next; + struct dma_page *page; + struct dma_pool *pool; + + next = buf; + size = PAGE_SIZE; + + temp = scnprintf(next, size, "poolinfo - 0.1\n"); + size -= temp; + next += temp; + + mutex_lock(&pools_lock); + list_for_each_entry(pool, &dev->dma_pools, pools) { + unsigned pages = 0; + unsigned blocks = 0; + + spin_lock_irq(&pool->lock); + list_for_each_entry(page, &pool->page_list, page_list) { + pages++; + blocks += page->in_use; + } + spin_unlock_irq(&pool->lock); + + /* per-pool info, no real 
statistics yet */ + temp = scnprintf(next, size, "%-16s %4u %4zu %4zu %2u\n", + pool->name, blocks, + pages * (pool->allocation / pool->size), + pool->size, pages); + size -= temp; + next += temp; + } + mutex_unlock(&pools_lock); + + return PAGE_SIZE - size; +} + +static DEVICE_ATTR_RO(pools); + +/** + * dma_pool_create - Creates a pool of consistent memory blocks, for dma. + * @name: name of pool, for diagnostics + * @dev: device that will be doing the DMA + * @size: size of the blocks in this pool. + * @align: alignment requirement for blocks; must be a power of two + * @boundary: returned blocks won't cross this power of two boundary + * Context: not in_interrupt() + * + * Given one of these pools, dma_pool_alloc() + * may be used to allocate memory. Such memory will all have "consistent" + * DMA mappings, accessible by the device and its driver without using + * cache flushing primitives. The actual size of blocks allocated may be + * larger than requested because of alignment. + * + * If @boundary is nonzero, objects returned from dma_pool_alloc() won't + * cross that size boundary. This is useful for devices which have + * addressing restrictions on individual DMA transfers, such as not crossing + * boundaries of 4KBytes. + * + * Return: a dma allocation pool with the requested characteristics, or + * %NULL if one can't be created. 
+ */ +struct dma_pool *dma_pool_create(const char *name, struct device *dev, + size_t size, size_t align, size_t boundary) +{ + struct dma_pool *retval; + size_t allocation; + bool empty = false; + + if (align == 0) + align = 1; + else if (align & (align - 1)) + return NULL; + + if (size == 0) + return NULL; + else if (size < 4) + size = 4; + + size = ALIGN(size, align); + allocation = max_t(size_t, size, PAGE_SIZE); + + if (!boundary) + boundary = allocation; + else if ((boundary < size) || (boundary & (boundary - 1))) + return NULL; + + retval = kmalloc(sizeof(*retval), GFP_KERNEL); + if (!retval) + return retval; + + strscpy(retval->name, name, sizeof(retval->name)); + + retval->dev = dev; + + INIT_LIST_HEAD(&retval->page_list); + spin_lock_init(&retval->lock); + retval->size = size; + retval->boundary = boundary; + retval->allocation = allocation; + + INIT_LIST_HEAD(&retval->pools); + + /* + * pools_lock ensures that the ->dma_pools list does not get corrupted. + * pools_reg_lock ensures that there is not a race between + * dma_pool_create() and dma_pool_destroy() or within dma_pool_create() + * when the first invocation of dma_pool_create() failed on + * device_create_file() and the second assumes that it has been done (I + * know it is a short window). 
+ */ + mutex_lock(&pools_reg_lock); + mutex_lock(&pools_lock); + if (list_empty(&dev->dma_pools)) + empty = true; + list_add(&retval->pools, &dev->dma_pools); + mutex_unlock(&pools_lock); + if (empty) { + int err; + + err = device_create_file(dev, &dev_attr_pools); + if (err) { + mutex_lock(&pools_lock); + list_del(&retval->pools); + mutex_unlock(&pools_lock); + mutex_unlock(&pools_reg_lock); + kfree(retval); + return NULL; + } + } + mutex_unlock(&pools_reg_lock); + return retval; +} +EXPORT_SYMBOL(dma_pool_create); + +static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page) +{ + unsigned int offset = 0; + unsigned int next_boundary = pool->boundary; + + do { + unsigned int next = offset + pool->size; + if (unlikely((next + pool->size) >= next_boundary)) { + next = next_boundary; + next_boundary += pool->boundary; + } + *(int *)(page->vaddr + offset) = next; + offset = next; + } while (offset < pool->allocation); +} + +static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) +{ + struct dma_page *page; + + page = kmalloc(sizeof(*page), mem_flags); + if (!page) + return NULL; + page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation, + &page->dma, mem_flags); + if (page->vaddr) { +#ifdef DMAPOOL_DEBUG + memset(page->vaddr, POOL_POISON_FREED, pool->allocation); +#endif + pool_initialise_page(pool, page); + page->in_use = 0; + page->offset = 0; + } else { + kfree(page); + page = NULL; + } + return page; +} + +static inline bool is_page_busy(struct dma_page *page) +{ + return page->in_use != 0; +} + +static void pool_free_page(struct dma_pool *pool, struct dma_page *page) +{ + dma_addr_t dma = page->dma; + +#ifdef DMAPOOL_DEBUG + memset(page->vaddr, POOL_POISON_FREED, pool->allocation); +#endif + dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma); + list_del(&page->page_list); + kfree(page); +} + +/** + * dma_pool_destroy - destroys a pool of dma memory blocks. 
+ * @pool: dma pool that will be destroyed + * Context: !in_interrupt() + * + * Caller guarantees that no more memory from the pool is in use, + * and that nothing will try to use the pool after this call. + */ +void dma_pool_destroy(struct dma_pool *pool) +{ + struct dma_page *page, *tmp; + bool empty = false; + + if (unlikely(!pool)) + return; + + mutex_lock(&pools_reg_lock); + mutex_lock(&pools_lock); + list_del(&pool->pools); + if (pool->dev && list_empty(&pool->dev->dma_pools)) + empty = true; + mutex_unlock(&pools_lock); + if (empty) + device_remove_file(pool->dev, &dev_attr_pools); + mutex_unlock(&pools_reg_lock); + + list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) { + if (is_page_busy(page)) { + if (pool->dev) + dev_err(pool->dev, "%s %s, %p busy\n", __func__, + pool->name, page->vaddr); + else + pr_err("%s %s, %p busy\n", __func__, + pool->name, page->vaddr); + /* leak the still-in-use consistent memory */ + list_del(&page->page_list); + kfree(page); + } else + pool_free_page(pool, page); + } + + kfree(pool); +} +EXPORT_SYMBOL(dma_pool_destroy); + +/** + * dma_pool_alloc - get a block of consistent memory + * @pool: dma pool that will produce the block + * @mem_flags: GFP_* bitmask + * @handle: pointer to dma address of block + * + * Return: the kernel virtual address of a currently unused block, + * and reports its dma address through the handle. + * If such a memory block can't be allocated, %NULL is returned. 
+ */
+void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
+		     dma_addr_t *handle)
+{
+	unsigned long flags;
+	struct dma_page *page;
+	size_t offset;
+	void *retval;
+
+	might_alloc(mem_flags);
+
+	/*
+	 * Look for a page that still has a free block: page->offset is the
+	 * offset of its first free block, and an offset at or beyond
+	 * pool->allocation means the page's free list is empty.
+	 */
+	spin_lock_irqsave(&pool->lock, flags);
+	list_for_each_entry(page, &pool->page_list, page_list) {
+		if (page->offset < pool->allocation)
+			goto ready;
+	}
+
+	/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	/*
+	 * __GFP_ZERO is masked out for the page allocation; the returned
+	 * block is cleared further down when want_init_on_alloc() asks for it.
+	 */
+	page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO));
+	if (!page)
+		return NULL;
+
+	spin_lock_irqsave(&pool->lock, flags);
+
+	list_add(&page->page_list, &pool->page_list);
+ ready:
+	/* Pop the first free block; its first bytes hold the next free offset. */
+	page->in_use++;
+	offset = page->offset;
+	page->offset = *(int *)(page->vaddr + offset);
+	retval = offset + page->vaddr;
+	*handle = offset + page->dma;
+#ifdef	DMAPOOL_DEBUG
+	{
+		int i;
+		u8 *data = retval;
+		/* page->offset is stored in first 4 bytes */
+		for (i = sizeof(page->offset); i < pool->size; i++) {
+			if (data[i] == POOL_POISON_FREED)
+				continue;
+			if (pool->dev)
+				dev_err(pool->dev, "%s %s, %p (corrupted)\n",
+					__func__, pool->name, retval);
+			else
+				pr_err("%s %s, %p (corrupted)\n",
+					__func__, pool->name, retval);
+
+			/*
+			 * Dump the first 4 bytes even if they are not
+			 * POOL_POISON_FREED
+			 */
+			print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
+					data, pool->size, 1);
+			break;
+		}
+	}
+	if (!(mem_flags & __GFP_ZERO))
+		memset(retval, POOL_POISON_ALLOCATED, pool->size);
+#endif
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	if (want_init_on_alloc(mem_flags))
+		memset(retval, 0, pool->size);
+
+	return retval;
+}
+EXPORT_SYMBOL(dma_pool_alloc);
+
+/*
+ * Return the page of @pool whose DMA range
+ * [page->dma, page->dma + pool->allocation) contains @dma, or NULL if there
+ * is none. The only caller in this file, dma_pool_free(), holds pool->lock.
+ */
+static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
+{
+	struct dma_page *page;
+
+	list_for_each_entry(page, &pool->page_list, page_list) {
+		if (dma < page->dma)
+			continue;
+		if ((dma - page->dma) < pool->allocation)
+			return page;
+	}
+	return NULL;
+}
+
+/**
+ * dma_pool_free - put block back into dma pool +
* @pool: the dma pool holding the block + * @vaddr: virtual address of block + * @dma: dma address of block + * + * Caller promises neither device nor driver will again touch this block + * unless it is first re-allocated. + */ +void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) +{ + struct dma_page *page; + unsigned long flags; + unsigned int offset; + + spin_lock_irqsave(&pool->lock, flags); + page = pool_find_page(pool, dma); + if (!page) { + spin_unlock_irqrestore(&pool->lock, flags); + if (pool->dev) + dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n", + __func__, pool->name, vaddr, &dma); + else + pr_err("%s %s, %p/%pad (bad dma)\n", + __func__, pool->name, vaddr, &dma); + return; + } + + offset = vaddr - page->vaddr; + if (want_init_on_free()) + memset(vaddr, 0, pool->size); +#ifdef DMAPOOL_DEBUG + if ((dma - page->dma) != offset) { + spin_unlock_irqrestore(&pool->lock, flags); + if (pool->dev) + dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n", + __func__, pool->name, vaddr, &dma); + else + pr_err("%s %s, %p (bad vaddr)/%pad\n", + __func__, pool->name, vaddr, &dma); + return; + } + { + unsigned int chain = page->offset; + while (chain < pool->allocation) { + if (chain != offset) { + chain = *(int *)(page->vaddr + chain); + continue; + } + spin_unlock_irqrestore(&pool->lock, flags); + if (pool->dev) + dev_err(pool->dev, "%s %s, dma %pad already free\n", + __func__, pool->name, &dma); + else + pr_err("%s %s, dma %pad already free\n", + __func__, pool->name, &dma); + return; + } + } + memset(vaddr, POOL_POISON_FREED, pool->size); +#endif + + page->in_use--; + *(int *)vaddr = page->offset; + page->offset = offset; + /* + * Resist a temptation to do + * if (!is_page_busy(page)) pool_free_page(pool, page); + * Better have a few empty pages hang around. 
+ */ + spin_unlock_irqrestore(&pool->lock, flags); +} +EXPORT_SYMBOL(dma_pool_free); + +/* + * Managed DMA pool + */ +static void dmam_pool_release(struct device *dev, void *res) +{ + struct dma_pool *pool = *(struct dma_pool **)res; + + dma_pool_destroy(pool); +} + +static int dmam_pool_match(struct device *dev, void *res, void *match_data) +{ + return *(struct dma_pool **)res == match_data; +} + +/** + * dmam_pool_create - Managed dma_pool_create() + * @name: name of pool, for diagnostics + * @dev: device that will be doing the DMA + * @size: size of the blocks in this pool. + * @align: alignment requirement for blocks; must be a power of two + * @allocation: returned blocks won't cross this boundary (or zero) + * + * Managed dma_pool_create(). DMA pool created with this function is + * automatically destroyed on driver detach. + * + * Return: a managed dma allocation pool with the requested + * characteristics, or %NULL if one can't be created. + */ +struct dma_pool *dmam_pool_create(const char *name, struct device *dev, + size_t size, size_t align, size_t allocation) +{ + struct dma_pool **ptr, *pool; + + ptr = devres_alloc(dmam_pool_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return NULL; + + pool = *ptr = dma_pool_create(name, dev, size, align, allocation); + if (pool) + devres_add(dev, ptr); + else + devres_free(ptr); + + return pool; +} +EXPORT_SYMBOL(dmam_pool_create); + +/** + * dmam_pool_destroy - Managed dma_pool_destroy() + * @pool: dma pool that will be destroyed + * + * Managed dma_pool_destroy(). 
+ */
+void dmam_pool_destroy(struct dma_pool *pool)
+{
+	struct device *dev = pool->dev;
+
+	/*
+	 * Release the devres entry registered by dmam_pool_create() for this
+	 * pool; a nonzero return from devres_release() triggers the warning
+	 * (presumably meaning no matching entry was found - confirm against
+	 * the devres API).
+	 */
+	WARN_ON(devres_release(dev, dmam_pool_release, dmam_pool_match, pool));
+}
+EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
new file mode 100644
index 000000000..9bc12e526
--- /dev/null
+++ b/mm/early_ioremap.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Provide common bits of early_ioremap() support for architectures needing
+ * temporary mappings during boot before ioremap() is available.
+ *
+ * This is mostly a direct copy of the x86 early_ioremap implementation.
+ *
+ * (C) Copyright 1995 1996, 2014 Linus Torvalds
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "internal.h"
+
+#ifdef CONFIG_MMU
+/* Non-zero once "early_ioremap_debug" was given on the command line. */
+static int early_ioremap_debug __initdata;
+
+/* early_param handler: the mere presence of the parameter enables debug. */
+static int __init early_ioremap_debug_setup(char *str)
+{
+	early_ioremap_debug = 1;
+
+	return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+/* Set by early_ioremap_reset(); selects the __late_*_fixmap helpers. */
+static int after_paging_init __initdata;
+
+/*
+ * Weak default that leaves the protection untouched; being __weak, another
+ * definition can replace it to adjust @prot for the given range.
+ */
+pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr,
+						    unsigned long size,
+						    pgprot_t prot)
+{
+	return prot;
+}
+
+/* Switch subsequent early mappings over to the __late_*_fixmap helpers. */
+void __init early_ioremap_reset(void)
+{
+	after_paging_init = 1;
+}
+
+/*
+ * Generally, ioremap() is available after paging_init() has been called.
+ * Architectures wanting to allow early_ioremap after paging_init() can
+ * define __late_set_fixmap and __late_clear_fixmap to do the right thing.
+ */
+#ifndef __late_set_fixmap
+static inline void __init __late_set_fixmap(enum fixed_addresses idx,
+					    phys_addr_t phys, pgprot_t prot)
+{
+	BUG();
+}
+#endif
+
+#ifndef __late_clear_fixmap
+static inline void __init __late_clear_fixmap(enum fixed_addresses idx)
+{
+	BUG();
+}
+#endif
+
+/*
+ * Per-slot bookkeeping:
+ *   prev_map[]  - address returned for the slot, NULL while the slot is free
+ *   prev_size[] - size the caller asked for, checked again on unmap
+ *   slot_virt[] - base virtual address of the slot's fixmap window
+ */
+static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
+/*
+ * Warn if a slot is already marked in use, then compute the base virtual
+ * address of every slot.
+ */
+void __init early_ioremap_setup(void)
+{
+	int i;
+
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+		if (WARN_ON(prev_map[i]))
+			break;
+
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+		slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+}
+
+/*
+ * Late initcall: warn when early mappings are still in place. Returns 1 if
+ * at least one slot is still mapped, 0 otherwise.
+ */
+static int __init check_early_ioremap_leak(void)
+{
+	int count = 0;
+	int i;
+
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+		if (prev_map[i])
+			count++;
+
+	if (WARN(count, KERN_WARNING
+		 "Debug warning: early ioremap leak of %d areas detected.\n"
+		 "please boot with early_ioremap_debug and report the dmesg.\n",
+		 count))
+		return 1;
+	return 0;
+}
+late_initcall(check_early_ioremap_leak);
+
+/*
+ * Map @size bytes starting at @phys_addr with protection @prot into the
+ * first free slot. Returns the virtual address corresponding to @phys_addr
+ * (slot base plus the offset within the page), or NULL when no slot is free,
+ * @size is zero, the range wraps, or it needs more than NR_FIX_BTMAPS pages.
+ */
+static void __init __iomem *
+__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
+{
+	unsigned long offset;
+	resource_size_t last_addr;
+	unsigned int nrpages;
+	enum fixed_addresses idx;
+	int i, slot;
+
+	WARN_ON(system_state >= SYSTEM_RUNNING);
+
+	slot = -1;
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+		if (!prev_map[i]) {
+			slot = i;
+			break;
+		}
+	}
+
+	if (WARN(slot < 0, "%s(%pa, %08lx) not found slot\n",
+		 __func__, &phys_addr, size))
+		return NULL;
+
+	/* Don't allow wraparound or zero size */
+	last_addr = phys_addr + size - 1;
+	if (WARN_ON(!size || last_addr < phys_addr))
+		return NULL;
+
+	/* Remember the caller's size; early_iounmap() insists on the same. */
+	prev_size[slot] = size;
+	/*
+	 * Mappings have to be page-aligned
+	 */
+	offset = offset_in_page(phys_addr);
+	phys_addr &= PAGE_MASK;
+	size = PAGE_ALIGN(last_addr + 1) - phys_addr;
+
+	/*
+	 * Mappings have to fit in the FIX_BTMAP area.
+	 */
+	nrpages = size >> PAGE_SHIFT;
+	if (WARN_ON(nrpages > NR_FIX_BTMAPS))
+		return NULL;
+
+	/*
+	 * Ok, go for it..
+	 */
+	idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+	while (nrpages > 0) {
+		if (after_paging_init)
+			__late_set_fixmap(idx, phys_addr, prot);
+		else
+			__early_set_fixmap(idx, phys_addr, prot);
+		phys_addr += PAGE_SIZE;
+		--idx;
+		--nrpages;
+	}
+	/*
+	 * NOTE(review): phys_addr has been page-aligned and advanced past the
+	 * mapping by the loop above, and size is the page-aligned size, so
+	 * this debug line does not print the caller's original arguments.
+	 */
+	WARN(early_ioremap_debug, "%s(%pa, %08lx) [%d] => %08lx + %08lx\n",
+	     __func__, &phys_addr, size, slot, offset, slot_virt[slot]);
+
+	prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
+	return prev_map[slot];
+}
+
+/*
+ * Undo an early mapping. @addr and @size must be exactly what was returned
+ * by and passed to the mapping call; otherwise a warning is issued and the
+ * mapping is left in place.
+ */
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+	unsigned long virt_addr;
+	unsigned long offset;
+	unsigned int nrpages;
+	enum fixed_addresses idx;
+	int i, slot;
+
+	slot = -1;
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+		if (prev_map[i] == addr) {
+			slot = i;
+			break;
+		}
+	}
+
+	if (WARN(slot < 0, "%s(%p, %08lx) not found slot\n",
+		  __func__, addr, size))
+		return;
+
+	if (WARN(prev_size[slot] != size,
+		 "%s(%p, %08lx) [%d] size not consistent %08lx\n",
+		  __func__, addr, size, slot, prev_size[slot]))
+		return;
+
+	WARN(early_ioremap_debug, "%s(%p, %08lx) [%d]\n",
+	      __func__, addr, size, slot);
+
+	virt_addr = (unsigned long)addr;
+	if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
+		return;
+
+	offset = offset_in_page(virt_addr);
+	nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
+
+	idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+	while (nrpages > 0) {
+		if (after_paging_init)
+			__late_clear_fixmap(idx);
+		else
+			__early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR);
+		--idx;
+		--nrpages;
+	}
+	/* Mark the slot free again. */
+	prev_map[slot] = NULL;
+}
+
+/* Remap an IO device */
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+	return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO);
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+	pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+						     FIXMAP_PAGE_NORMAL);
+
+	return (__force void *)__early_ioremap(phys_addr, size, prot);
+}
+#ifdef FIXMAP_PAGE_RO
+/* Remap memory read-only */
+void __init *
+early_memremap_ro(resource_size_t phys_addr, unsigned long size)
+{
+	pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size,
+						     FIXMAP_PAGE_RO);
+
+	return (__force void *)__early_ioremap(phys_addr, size, prot);
+}
+#endif
+
+#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+/* Remap memory with the caller-supplied raw protection value */
+void __init *
+early_memremap_prot(resource_size_t phys_addr, unsigned long size,
+		    unsigned long prot_val)
+{
+	return (__force void *)__early_ioremap(phys_addr, size,
+					       __pgprot(prot_val));
+}
+#endif
+
+/* Largest number of bytes a single slot can map. */
+#define MAX_MAP_CHUNK	(NR_FIX_BTMAPS << PAGE_SHIFT)
+
+/*
+ * Copy @size bytes from physical address @src to @dest, mapping the source
+ * through early_memremap() in pieces that fit into one slot.
+ */
+void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+{
+	unsigned long slop, clen;
+	char *p;
+
+	while (size) {
+		/* slop: offset of src inside its page; clen: bytes this round */
+		slop = offset_in_page(src);
+		clen = size;
+		if (clen > MAX_MAP_CHUNK - slop)
+			clen = MAX_MAP_CHUNK - slop;
+		p = early_memremap(src & PAGE_MASK, clen + slop);
+		memcpy(dest, p + slop, clen);
+		early_memunmap(p, clen + slop);
+		dest += clen;
+		src += clen;
+		size -= clen;
+	}
+}
+
+#else /* CONFIG_MMU */
+
+/* Without an MMU the physical address is returned unchanged. */
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+	return (__force void __iomem *)phys_addr;
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+	return (void *)phys_addr;
+}
+void __init *
+early_memremap_ro(resource_size_t phys_addr, unsigned long size)
+{
+	return (void *)phys_addr;
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+}
+
+#endif /* CONFIG_MMU */
+
+
+/* Counterpart of early_memremap*(): forwards to early_iounmap(). */
+void __init early_memunmap(void *addr, unsigned long size)
+{
+	early_iounmap((__force void __iomem *)addr, size);
+}
diff --git a/mm/fadvise.c b/mm/fadvise.c
new file mode 100644
index 000000000..c76ee6653
--- /dev/null
+++ b/mm/fadvise.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mm/fadvise.c
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 11Jan2003	Andrew Morton
+ *
Initial version.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "internal.h"
+
+/*
+ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
+ * deactivate the pages and clear PG_Referenced.
+ */
+
+/*
+ * Default fadvise implementation. Applies @advice to the byte range
+ * [@offset, @offset + @len) of @file; a @len of 0 means "up to the largest
+ * possible offset".
+ *
+ * Returns 0 on success, -ESPIPE for a FIFO, and -EINVAL for a negative @len,
+ * a file without a mapping, or an unknown @advice. For DAX inodes and files
+ * backed by noop_backing_dev_info valid advice is accepted but ignored.
+ */
+int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+	struct inode *inode;
+	struct address_space *mapping;
+	struct backing_dev_info *bdi;
+	loff_t endbyte;			/* inclusive */
+	pgoff_t start_index;
+	pgoff_t end_index;
+	unsigned long nrpages;
+
+	inode = file_inode(file);
+	if (S_ISFIFO(inode->i_mode))
+		return -ESPIPE;
+
+	mapping = file->f_mapping;
+	if (!mapping || len < 0)
+		return -EINVAL;
+
+	bdi = inode_to_bdi(mapping->host);
+
+	if (IS_DAX(inode) || (bdi == &noop_backing_dev_info)) {
+		switch (advice) {
+		case POSIX_FADV_NORMAL:
+		case POSIX_FADV_RANDOM:
+		case POSIX_FADV_SEQUENTIAL:
+		case POSIX_FADV_WILLNEED:
+		case POSIX_FADV_NOREUSE:
+		case POSIX_FADV_DONTNEED:
+			/* no bad return value, but ignore advice */
+			break;
+		default:
+			return -EINVAL;
+		}
+		return 0;
+	}
+
+	/*
+	 * Careful about overflows. Len == 0 means "as much as possible". Use
+	 * unsigned math because signed overflows are undefined and UBSan
+	 * complains.
+	 */
+	endbyte = (u64)offset + (u64)len;
+	if (!len || endbyte < len)
+		endbyte = -1;
+	else
+		endbyte--;		/* inclusive */
+
+	switch (advice) {
+	case POSIX_FADV_NORMAL:
+		file->f_ra.ra_pages = bdi->ra_pages;
+		spin_lock(&file->f_lock);
+		file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
+		break;
+	case POSIX_FADV_RANDOM:
+		spin_lock(&file->f_lock);
+		file->f_mode |= FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
+		break;
+	case POSIX_FADV_SEQUENTIAL:
+		/* Sequential access doubles the readahead window. */
+		file->f_ra.ra_pages = bdi->ra_pages * 2;
+		spin_lock(&file->f_lock);
+		file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
+		break;
+	case POSIX_FADV_WILLNEED:
+		/* First and last PARTIAL page! */
+		start_index = offset >> PAGE_SHIFT;
+		end_index = endbyte >> PAGE_SHIFT;
+
+		/* Careful about overflow on the "+1" */
+		nrpages = end_index - start_index + 1;
+		if (!nrpages)
+			nrpages = ~0UL;
+
+		force_page_cache_readahead(mapping, file, start_index, nrpages);
+		break;
+	case POSIX_FADV_NOREUSE:
+		break;
+	case POSIX_FADV_DONTNEED:
+		__filemap_fdatawrite_range(mapping, offset, endbyte,
+					   WB_SYNC_NONE);
+
+		/*
+		 * First and last FULL page! Partial pages are deliberately
+		 * preserved on the expectation that it is better to preserve
+		 * needed memory than to discard unneeded memory.
+		 */
+		start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
+		end_index = (endbyte >> PAGE_SHIFT);
+		/*
+		 * The page at end_index will be inclusively discarded by
+		 * invalidate_mapping_pages(), so subtracting 1 from
+		 * end_index means we will skip the last page. But if endbyte
+		 * is page aligned or is at the end of file, we should not skip
+		 * that page - discarding the last page is safe enough.
+		 */
+		if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK &&
+				endbyte != inode->i_size - 1) {
+			/* First page is tricky as 0 - 1 = -1, but pgoff_t
+			 * is unsigned, so the end_index >= start_index
+			 * check below would be true and we'll discard the whole
+			 * file cache which is not what was asked.
+			 */
+			if (end_index == 0)
+				break;
+
+			end_index--;
+		}
+
+		if (end_index >= start_index) {
+			unsigned long nr_pagevec = 0;
+
+			/*
+			 * It's common to FADV_DONTNEED right after
+			 * the read or write that instantiates the
+			 * pages, in which case there will be some
+			 * sitting on the local LRU cache. Try to
+			 * avoid the expensive remote drain and the
+			 * second cache tree walk below by flushing
+			 * them out right away.
+			 */
+			lru_add_drain();
+
+			invalidate_mapping_pagevec(mapping,
+						start_index, end_index,
+						&nr_pagevec);
+
+			/*
+			 * If fewer pages were invalidated than expected then
+			 * it is possible that some of the pages were on
+			 * a per-cpu pagevec for a remote CPU. Drain all
+			 * pagevecs and try again.
+			 */
+			if (nr_pagevec) {
+				lru_add_drain_all();
+				invalidate_mapping_pages(mapping, start_index,
+						end_index);
+			}
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(generic_fadvise);
+
+/*
+ * Dispatch to the file's own ->fadvise() method when it has one, otherwise
+ * fall back to generic_fadvise().
+ */
+int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+	if (file->f_op->fadvise)
+		return file->f_op->fadvise(file, offset, len, advice);
+
+	return generic_fadvise(file, offset, len, advice);
+}
+EXPORT_SYMBOL(vfs_fadvise);
+
+#ifdef CONFIG_ADVISE_SYSCALLS
+
+/*
+ * Common body of the fadvise system calls: look up @fd, apply the advice and
+ * drop the reference again. Returns -EBADF when @fd is not an open file.
+ */
+int ksys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
+{
+	struct fd f = fdget(fd);
+	int ret;
+
+	if (!f.file)
+		return -EBADF;
+
+	ret = vfs_fadvise(f.file, offset, len, advice);
+
+	fdput(f);
+	return ret;
+}
+
+SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
+{
+	return ksys_fadvise64_64(fd, offset, len, advice);
+}
+
+#ifdef __ARCH_WANT_SYS_FADVISE64
+
+SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice)
+{
+	return ksys_fadvise64_64(fd, offset, len, advice);
+}
+
+#endif
+
+#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FADVISE64_64)
+
+COMPAT_SYSCALL_DEFINE6(fadvise64_64, int, fd, compat_arg_u64_dual(offset),
+		       compat_arg_u64_dual(len), int, advice)
+{
+	return ksys_fadvise64_64(fd, compat_arg_u64_glue(offset),
+				 compat_arg_u64_glue(len), advice);
+}
+
+#endif
+#endif
diff --git a/mm/failslab.c b/mm/failslab.c
new file mode 100644
index 000000000..ffc420c0e
--- /dev/null
+++ b/mm/failslab.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include "slab.h"
+
+/* Fault-injection state for slab allocations and its two tunables. */
+static struct {
+	struct fault_attr attr;
+	bool ignore_gfp_reclaim;
+	bool cache_filter;
+} failslab = {
+	.attr = FAULT_ATTR_INITIALIZER,
+	.ignore_gfp_reclaim = true,
+	.cache_filter = false,
+};
+
+bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
+{
+	int flags = 0;
+
+	/* No fault-injection for bootstrap cache */
+	if (unlikely(s == kmem_cache))
+ return false; + + if (gfpflags & __GFP_NOFAIL) + return false; + + if (failslab.ignore_gfp_reclaim && + (gfpflags & __GFP_DIRECT_RECLAIM)) + return false; + + if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB)) + return false; + + /* + * In some cases, it expects to specify __GFP_NOWARN + * to avoid printing any information(not just a warning), + * thus avoiding deadlocks. See commit 6b9dbedbe349 for + * details. + */ + if (gfpflags & __GFP_NOWARN) + flags |= FAULT_NOWARN; + + return should_fail_ex(&failslab.attr, s->object_size, flags); +} + +static int __init setup_failslab(char *str) +{ + return setup_fault_attr(&failslab.attr, str); +} +__setup("failslab=", setup_failslab); + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS +static int __init failslab_debugfs_init(void) +{ + struct dentry *dir; + umode_t mode = S_IFREG | 0600; + + dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &failslab.ignore_gfp_reclaim); + debugfs_create_bool("cache-filter", mode, dir, + &failslab.cache_filter); + + return 0; +} + +late_initcall(failslab_debugfs_init); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ diff --git a/mm/filemap.c b/mm/filemap.c new file mode 100644 index 000000000..2809b1174 --- /dev/null +++ b/mm/filemap.c @@ -0,0 +1,4017 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/mm/filemap.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * This file handles the generic file mmap semantics used by + * most "normal" filesystems (but you don't /have/ to use this: + * the NFS filesystem used to do this differently, for example) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include "internal.h" + +#define CREATE_TRACE_POINTS +#include + +/* + * FIXME: remove all knowledge of the buffer layer from the core VM + */ +#include /* for try_to_free_buffers */ + +#include + +/* + * Shared mappings implemented 30.11.1994. It's not fully working yet, + * though. + * + * Shared mappings now work. 15.8.1995 Bruno. + * + * finished 'unifying' the page and buffer cache and SMP-threaded the + * page-cache, 21.05.1999, Ingo Molnar + * + * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli + */ + +/* + * Lock ordering: + * + * ->i_mmap_rwsem (truncate_pagecache) + * ->private_lock (__free_pte->block_dirty_folio) + * ->swap_lock (exclusive_swap_page, others) + * ->i_pages lock + * + * ->i_rwsem + * ->invalidate_lock (acquired by fs in truncate path) + * ->i_mmap_rwsem (truncate->unmap_mapping_range) + * + * ->mmap_lock + * ->i_mmap_rwsem + * ->page_table_lock or pte_lock (various, mainly in memory.c) + * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) + * + * ->mmap_lock + * ->invalidate_lock (filemap_fault) + * ->lock_page (filemap_fault, access_process_vm) + * + * ->i_rwsem (generic_perform_write) + * ->mmap_lock (fault_in_readable->do_page_fault) + * + * bdi->wb.list_lock + * sb_lock (fs/fs-writeback.c) + * ->i_pages lock (__sync_single_inode) + * + * ->i_mmap_rwsem + * ->anon_vma.lock (vma_adjust) + * + * ->anon_vma.lock + * ->page_table_lock or pte_lock (anon_vma_prepare and various) + * + * ->page_table_lock or pte_lock + * ->swap_lock (try_to_unmap_one) + * ->private_lock (try_to_unmap_one) + * ->i_pages lock (try_to_unmap_one) + * ->lruvec->lru_lock (follow_page->mark_page_accessed) + * ->lruvec->lru_lock (check_pte_range->isolate_lru_page) + * ->private_lock (page_remove_rmap->set_page_dirty) + * ->i_pages lock (page_remove_rmap->set_page_dirty) + * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) + * ->inode->i_lock (page_remove_rmap->set_page_dirty) + * ->memcg->move_lock 
(page_remove_rmap->lock_page_memcg)
+ * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
+ * ->inode->i_lock (zap_pte_range->set_page_dirty)
+ * ->private_lock (zap_pte_range->block_dirty_folio)
+ *
+ * ->i_mmap_rwsem
+ * ->tasklist_lock (memory_failure, collect_procs_ao)
+ */
+/*
+ * page_cache_delete - replace a folio's entry in @mapping->i_pages
+ * @mapping: the address_space the folio currently belongs to
+ * @folio: the folio to remove; must be locked (VM_BUG_ON_FOLIO otherwise)
+ * @shadow: value stored in the folio's slot in place of the folio
+ *
+ * Stores @shadow over the folio's xarray entry, resets the marks of that
+ * entry, clears folio->mapping and subtracts the folio's page count from
+ * mapping->nrpages.  A hugetlb folio is counted as a single page here.
+ */
+static void page_cache_delete(struct address_space *mapping,
+				   struct folio *folio, void *shadow)
+{
+	XA_STATE(xas, &mapping->i_pages, folio->index);
+	long nr = 1;
+
+	mapping_set_update(&xas, mapping);
+
+	/* hugetlb pages are represented by a single entry in the xarray */
+	if (!folio_test_hugetlb(folio)) {
+		xas_set_order(&xas, folio->index, folio_order(folio));
+		nr = folio_nr_pages(folio);
+	}
+
+	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+	xas_store(&xas, shadow);
+	xas_init_marks(&xas);
+
+	folio->mapping = NULL;
+	/* Leave page->index set: truncation lookup relies upon it */
+	mapping->nrpages -= nr;
+}
+/*
+ * filemap_unaccount_folio - undo statistics for a folio leaving the page cache
+ * @mapping: the address_space the folio is being removed from
+ * @folio: the folio being removed
+ *
+ * Reports a folio that is still mapped, then (except for hugetlb folios,
+ * which return early) subtracts the folio's pages from NR_FILE_PAGES and
+ * the shmem/THP counters and fixes up dirty accounting if the folio is
+ * still dirty on a mapping that can do writeback.
+ */
+static void filemap_unaccount_folio(struct address_space *mapping,
+		struct folio *folio)
+{
+	long nr;
+
+	VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
+	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
+		pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
+			 current->comm, folio_pfn(folio));
+		dump_page(&folio->page, "still mapped when deleted");
+		dump_stack();
+		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+		if (mapping_exiting(mapping) && !folio_test_large(folio)) {
+			int mapcount = page_mapcount(&folio->page);
+
+			if (folio_ref_count(folio) >= mapcount + 2) {
+				/*
+				 * All vmas have already been torn down, so it's
+				 * a good bet that actually the page is unmapped
+				 * and we'd rather not leak it: if we're wrong,
+				 * another bad page check should catch it later.
+				 */
+				page_mapcount_reset(&folio->page);
+				folio_ref_sub(folio, mapcount);
+			}
+		}
+	}
+
+	/* hugetlb folios do not participate in page cache accounting. */
+	if (folio_test_hugetlb(folio))
+		return;
+
+	nr = folio_nr_pages(folio);
+
+	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+	if (folio_test_swapbacked(folio)) {
+		__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
+		if (folio_test_pmd_mappable(folio))
+			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
+	} else if (folio_test_pmd_mappable(folio)) {
+		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
+		filemap_nr_thps_dec(mapping);
+	}
+
+	/*
+	 * At this point folio must be either written or cleaned by
+	 * truncate.  Dirty folio here signals a bug and loss of
+	 * unwritten data - on ordinary filesystems.
+	 *
+	 * But it's harmless on in-memory filesystems like tmpfs; and can
+	 * occur when a driver which did get_user_pages() sets page dirty
+	 * before putting it, while the inode is being finally evicted.
+	 *
+	 * Below fixes dirty accounting after removing the folio entirely
+	 * but leaves the dirty flag set: it has no effect for truncated
+	 * folio and anyway will be cleared before returning folio to
+	 * buddy allocator.
+	 */
+	if (WARN_ON_ONCE(folio_test_dirty(folio) &&
+			 mapping_can_writeback(mapping)))
+		folio_account_cleaned(folio, inode_to_wb(mapping->host));
+}
+
+/*
+ * Delete a folio from the page cache.  This only unaccounts the folio and
+ * removes it from the mapping; the page cache's references are not dropped
+ * here (see filemap_free_folio()).  Caller has to make
+ * sure the page is locked and that nobody else uses it - or that usage
+ * is safe.  The caller must hold the i_pages lock.
+ */
+void __filemap_remove_folio(struct folio *folio, void *shadow)
+{
+	struct address_space *mapping = folio->mapping;
+
+	trace_mm_filemap_delete_from_page_cache(folio);
+	filemap_unaccount_folio(mapping, folio);
+	page_cache_delete(mapping, folio, shadow);	/* clears folio->mapping */
+}
+/*
+ * filemap_free_folio - release a folio that was removed from the page cache
+ * @mapping: the address_space the folio was removed from
+ * @folio: the folio
+ *
+ * Calls the mapping's ->free_folio() hook if it has one, then puts
+ * references on the folio: folio_nr_pages() of them for a large
+ * non-hugetlb folio, one otherwise.
+ */
+void filemap_free_folio(struct address_space *mapping, struct folio *folio)
+{
+	void (*free_folio)(struct folio *);
+	int refs = 1;
+
+	free_folio = mapping->a_ops->free_folio;
+	if (free_folio)
+		free_folio(folio);
+
+	if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+		refs = folio_nr_pages(folio);
+	folio_put_refs(folio, refs);
+}
+
+/**
+ * filemap_remove_folio - Remove folio from page cache.
+ * @folio: The folio.
+ *
+ * This must be called only on folios that are locked and have been
+ * verified to be in the page cache.  It will never put the folio into
+ * the free list because the caller has a reference on the page.
+ *
+ * Takes the inode's i_lock and then the i_pages lock around the removal;
+ * filemap_free_folio() is called after both have been released.
+ */
+void filemap_remove_folio(struct folio *folio)
+{
+	struct address_space *mapping = folio->mapping;
+
+	BUG_ON(!folio_test_locked(folio));
+	spin_lock(&mapping->host->i_lock);
+	xa_lock_irq(&mapping->i_pages);
+	__filemap_remove_folio(folio, NULL);
+	xa_unlock_irq(&mapping->i_pages);
+	if (mapping_shrinkable(mapping))
+		inode_add_lru(mapping->host);
+	spin_unlock(&mapping->host->i_lock);
+
+	filemap_free_folio(mapping, folio);
+}
+
+/*
+ * page_cache_delete_batch - delete several folios from page cache
+ * @mapping: the mapping to which folios belong
+ * @fbatch: batch of folios to delete
+ *
+ * The function walks over mapping->i_pages and removes folios passed in
+ * @fbatch from the mapping. The function expects @fbatch to be sorted
+ * by page index and is optimised for it to be dense.
+ * It tolerates holes in @fbatch (mapping entries at those indices are not
+ * modified).
+ *
+ * The function expects the i_pages lock to be held.
+ */
+static void page_cache_delete_batch(struct address_space *mapping,
+			     struct folio_batch *fbatch)
+{
+	XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
+	long total_pages = 0;
+	int i = 0;	/* index of the next batch folio we expect to meet */
+	struct folio *folio;
+
+	mapping_set_update(&xas, mapping);
+	xas_for_each(&xas, folio, ULONG_MAX) {
+		if (i >= folio_batch_count(fbatch))
+			break;
+
+		/* A swap/dax/shadow entry got inserted? Skip it. */
+		if (xa_is_value(folio))
+			continue;
+		/*
+		 * A page got inserted in our range? Skip it. We have our
+		 * pages locked so they are protected from being removed.
+		 * If we see a page whose index is higher than ours, it
+		 * means our page has been removed, which shouldn't be
+		 * possible because we're holding the PageLock.
+		 */
+		if (folio != fbatch->folios[i]) {
+			VM_BUG_ON_FOLIO(folio->index >
+					fbatch->folios[i]->index, folio);
+			continue;
+		}
+
+		WARN_ON_ONCE(!folio_test_locked(folio));
+
+		folio->mapping = NULL;
+		/* Leave folio->index set: truncation lookup relies on it */
+
+		i++;
+		xas_store(&xas, NULL);
+		total_pages += folio_nr_pages(folio);
+	}
+	mapping->nrpages -= total_pages;
+}
+/*
+ * delete_from_page_cache_batch - remove a batch of folios from the page cache
+ * @mapping: the mapping to which the folios belong
+ * @fbatch: batch of folios to remove; an empty batch is a no-op
+ *
+ * Under the inode's i_lock and the i_pages lock, every folio in @fbatch is
+ * traced and unaccounted and the batch is then deleted with
+ * page_cache_delete_batch().  Once both locks have been dropped,
+ * filemap_free_folio() is called on each folio of the batch.
+ */
+void delete_from_page_cache_batch(struct address_space *mapping,
+				  struct folio_batch *fbatch)
+{
+	int i;
+
+	if (!folio_batch_count(fbatch))
+		return;
+
+	spin_lock(&mapping->host->i_lock);
+	xa_lock_irq(&mapping->i_pages);
+	for (i = 0; i < folio_batch_count(fbatch); i++) {
+		struct folio *folio = fbatch->folios[i];
+
+		trace_mm_filemap_delete_from_page_cache(folio);
+		filemap_unaccount_folio(mapping, folio);
+	}
+	page_cache_delete_batch(mapping, fbatch);
+	xa_unlock_irq(&mapping->i_pages);
+	if (mapping_shrinkable(mapping))
+		inode_add_lru(mapping->host);
+	spin_unlock(&mapping->host->i_lock);
+
+	for (i = 0; i < folio_batch_count(fbatch); i++)
+		filemap_free_folio(mapping, fbatch->folios[i]);
+}
+/*
+ * filemap_check_errors - fetch and clear the mapping's write error flags
+ * @mapping: address space to check
+ *
+ * AS_ENOSPC and AS_EIO are each tested and, if set, cleared.
+ *
+ * Return: -EIO if AS_EIO was set, otherwise -ENOSPC if AS_ENOSPC was set,
+ * otherwise 0.
+ */
+int filemap_check_errors(struct address_space *mapping)
+{
+	int ret = 0;
+	/* Check for outstanding write errors */
+	if (test_bit(AS_ENOSPC, &mapping->flags) &&
+	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+		ret = -ENOSPC;
+	if (test_bit(AS_EIO, &mapping->flags) &&
+	    test_and_clear_bit(AS_EIO, &mapping->flags))
+		ret = -EIO;	/* overrides -ENOSPC when both were set */
+	return ret;
+}
+EXPORT_SYMBOL(filemap_check_errors);
+/*
+ * Same result as filemap_check_errors(), but AS_EIO and AS_ENOSPC are only
+ * tested and are left set in mapping->flags.
+ */
+static int filemap_check_and_keep_errors(struct address_space *mapping)
+{
+	/* Check for outstanding write errors */
+	if (test_bit(AS_EIO, &mapping->flags))
+		return -EIO;
+	if (test_bit(AS_ENOSPC, &mapping->flags))
+		return -ENOSPC;
+	return 0;
+}
+
+/**
+ * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
+ * @mapping:	address space structure to write
+ * @wbc:	the writeback_control controlling the writeout
+ *
+ * Call writepages on the mapping using the provided wbc to control the
+ * writeout.  Nothing is done, and 0 is returned, if the mapping cannot do
+ * writeback or has no pages tagged dirty.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_fdatawrite_wbc(struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	int ret;
+
+	if (!mapping_can_writeback(mapping) ||
+	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+		return 0;
+
+	wbc_attach_fdatawrite_inode(wbc, mapping->host);
+	ret = do_writepages(mapping, wbc);
+	wbc_detach_inode(wbc);
+	return ret;
+}
+EXPORT_SYMBOL(filemap_fdatawrite_wbc);
+
+/**
+ * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
+ * @mapping:	address space structure to write
+ * @start:	offset in bytes where the range starts
+ * @end:	offset in bytes where the range ends (inclusive)
+ * @sync_mode:	writeback sync mode, stored in writeback_control.sync_mode
+ *
+ * Start writeback against all of a mapping's dirty pages that lie
+ * within the byte offsets [@start, @end] inclusive.
+ *
+ * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
+ * opposed to a regular memory cleansing writeback.  The difference between
+ * these two operations is that if a dirty page/buffer is encountered, it must
+ * be waited upon, and not just skipped over.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+				loff_t end, int sync_mode)
+{
+	struct writeback_control wbc = {
+		.sync_mode = sync_mode,
+		.nr_to_write = LONG_MAX,
+		.range_start = start,
+		.range_end = end,
+	};
+
+	return filemap_fdatawrite_wbc(mapping, &wbc);
+}
+/*
+ * Start writeback on the whole mapping: __filemap_fdatawrite_range() over
+ * the byte range 0..LLONG_MAX with the given @sync_mode.
+ */
+static inline int __filemap_fdatawrite(struct address_space *mapping,
+	int sync_mode)
+{
+	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
+}
+/*
+ * filemap_fdatawrite - start WB_SYNC_ALL writeback on the whole mapping
+ * @mapping: address space structure to write
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_fdatawrite(struct address_space *mapping)
+{
+	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
+}
+EXPORT_SYMBOL(filemap_fdatawrite);
+/*
+ * filemap_fdatawrite_range - start WB_SYNC_ALL writeback on a byte range
+ * @mapping: address space structure to write
+ * @start: offset in bytes where the range starts
+ * @end: offset in bytes where the range ends (inclusive)
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+				loff_t end)
+{
+	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
+}
+EXPORT_SYMBOL(filemap_fdatawrite_range);
+
+/**
+ * filemap_flush - mostly a non-blocking flush
+ * @mapping:	target address_space
+ *
+ * This is a mostly non-blocking flush.  Not suitable for data-integrity
+ * purposes - I/O may not be started against all dirty pages.
+ * It starts WB_SYNC_NONE writeback on the whole mapping.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int filemap_flush(struct address_space *mapping)
+{
+	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
+}
+EXPORT_SYMBOL(filemap_flush);
+
+/**
+ * filemap_range_has_page - check if a page exists in range.
+ * @mapping:           address space within which to check
+ * @start_byte:        offset in bytes where the range starts
+ * @end_byte:          offset in bytes where the range ends (inclusive)
+ *
+ * Find at least one page in the range supplied, usually used to check if
+ * direct writing in this range will trigger a writeback.
+ *
+ * Return: %true if at least one page exists in the specified range,
+ * %false otherwise.
+ */
+bool filemap_range_has_page(struct address_space *mapping,
+			   loff_t start_byte, loff_t end_byte)
+{
+	struct page *page;
+	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+	pgoff_t max = end_byte >> PAGE_SHIFT;
+
+	if (end_byte < start_byte)
+		return false;
+
+	rcu_read_lock();
+	for (;;) {
+		page = xas_find(&xas, max);	/* NULL once the range is exhausted */
+		if (xas_retry(&xas, page))
+			continue;
+		/* Shadow entries don't count */
+		if (xa_is_value(page))
+			continue;
+		/*
+		 * We don't need to try to pin this page; we're about to
+		 * release the RCU lock anyway.  It is enough to know that
+		 * there was a page here recently.
+		 */
+		break;
+	}
+	rcu_read_unlock();
+
+	return page != NULL;
+}
+EXPORT_SYMBOL(filemap_range_has_page);
+/*
+ * __filemap_fdatawait_range - wait for writeback on a byte range
+ * @mapping: address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Looks up, one pagevec at a time, the pages in the range that carry
+ * PAGECACHE_TAG_WRITEBACK; for each one it waits for writeback and then
+ * calls ClearPageError().  Does nothing if @end_byte < @start_byte.
+ * The mapping's error state is not examined here.
+ */
+static void __filemap_fdatawait_range(struct address_space *mapping,
+				     loff_t start_byte, loff_t end_byte)
+{
+	pgoff_t index = start_byte >> PAGE_SHIFT;
+	pgoff_t end = end_byte >> PAGE_SHIFT;
+	struct pagevec pvec;
+	int nr_pages;
+
+	if (end_byte < start_byte)
+		return;
+
+	pagevec_init(&pvec);
+	while (index <= end) {
+		unsigned i;
+
+		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
+				end, PAGECACHE_TAG_WRITEBACK);
+		if (!nr_pages)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			wait_on_page_writeback(page);
+			ClearPageError(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+/**
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping:		address space structure to wait for
+ * @start_byte:		offset in bytes where the range starts
+ * @end_byte:		offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them.  Check error status of
+ * the address space and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ *
+ * Return: error status of the address space.
+ */
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+			    loff_t end_byte)
+{
+	__filemap_fdatawait_range(mapping, start_byte, end_byte);
+	return filemap_check_errors(mapping);	/* tests and clears the flags */
+}
+EXPORT_SYMBOL(filemap_fdatawait_range);
+
+/**
+ * filemap_fdatawait_range_keep_errors - wait for writeback to complete
+ * @mapping:		address space structure to wait for
+ * @start_byte:		offset in bytes where the range starts
+ * @end_byte:		offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space in the
+ * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
+ * this function does not clear error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves.  Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8).
+ *
+ * Return: error status of the address space.
+ */
+int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
+		loff_t start_byte, loff_t end_byte)
+{
+	__filemap_fdatawait_range(mapping, start_byte, end_byte);
+	return filemap_check_and_keep_errors(mapping);
+}
+EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
+
+/**
+ * file_fdatawait_range - wait for writeback to complete
+ * @file:		file pointing to address space structure to wait for
+ * @start_byte:		offset in bytes where the range starts
+ * @end_byte:		offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the address space that file
+ * refers to, in the given range and wait for all of them.  Check error
+ * status of the address space vs. the file->f_wb_err cursor and return it.
+ *
+ * Since the error status of the file is advanced by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ *
+ * Return: error status of the address space vs. the file->f_wb_err cursor.
+ */
+int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
+{
+	struct address_space *mapping = file->f_mapping;
+
+	__filemap_fdatawait_range(mapping, start_byte, end_byte);
+	return file_check_and_advance_wb_err(file);
+}
+EXPORT_SYMBOL(file_fdatawait_range);
+
+/**
+ * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
+ * @mapping: address space structure to wait for
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * and wait for all of them.  Unlike filemap_fdatawait(), this function
+ * does not clear error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves.  Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8).
+ *
+ * Return: error status of the address space.
+ */
+int filemap_fdatawait_keep_errors(struct address_space *mapping)
+{
+	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);	/* every byte offset */
+	return filemap_check_and_keep_errors(mapping);
+}
+EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
+
+/* Returns true if writeback might be needed or already in progress.
*/
+static bool mapping_needs_writeback(struct address_space *mapping)
+{
+	return mapping->nrpages;	/* true whenever the mapping has any pages */
+}
+/*
+ * filemap_range_has_writeback - check for folios that need or are in writeback
+ * @mapping: address space within which to check
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Walks the range under rcu_read_lock(), skipping retry and value entries.
+ *
+ * Return: %true if some folio in the range is dirty, locked or under
+ * writeback; %false otherwise, including when @end_byte < @start_byte.
+ */
+bool filemap_range_has_writeback(struct address_space *mapping,
+				 loff_t start_byte, loff_t end_byte)
+{
+	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+	pgoff_t max = end_byte >> PAGE_SHIFT;
+	struct folio *folio;
+
+	if (end_byte < start_byte)
+		return false;
+
+	rcu_read_lock();
+	xas_for_each(&xas, folio, max) {
+		if (xas_retry(&xas, folio))
+			continue;
+		if (xa_is_value(folio))
+			continue;
+		if (folio_test_dirty(folio) || folio_test_locked(folio) ||
+				folio_test_writeback(folio))
+			break;
+	}
+	rcu_read_unlock();
+	return folio != NULL;
+}
+EXPORT_SYMBOL_GPL(filemap_range_has_writeback);
+
+/**
+ * filemap_write_and_wait_range - write out & wait on a file range
+ * @mapping:	the address_space for the pages
+ * @lstart:	offset in bytes where the range starts
+ * @lend:	offset in bytes where the range ends (inclusive)
+ *
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Note that @lend is inclusive (describes the last byte to be written) so
+ * that this function can be used to write to the very end-of-file (end = -1).
+ *
+ * The mapping's error flags are checked and cleared even when the mapping
+ * has no pages; an error from starting the writeback takes precedence over
+ * the error found in the flags.
+ *
+ * Return: error status of the address space.
+ */
+int filemap_write_and_wait_range(struct address_space *mapping,
+				 loff_t lstart, loff_t lend)
+{
+	int err = 0, err2;
+
+	if (mapping_needs_writeback(mapping)) {
+		err = __filemap_fdatawrite_range(mapping, lstart, lend,
+						 WB_SYNC_ALL);
+		/*
+		 * Even if the above returned error, the pages may be
+		 * written partially (e.g. -ENOSPC), so we wait for it.
+		 * But the -EIO is special case, it may indicate the worst
+		 * thing (e.g. bug) happened, so we avoid waiting for it.
+		 */
+		if (err != -EIO)
+			__filemap_fdatawait_range(mapping, lstart, lend);
+	}
+	err2 = filemap_check_errors(mapping);
+	if (!err)
+		err = err2;
+	return err;
+}
+EXPORT_SYMBOL(filemap_write_and_wait_range);
+/*
+ * __filemap_set_wb_err - record a writeback error in mapping->wb_err
+ * @mapping: mapping the error is recorded against
+ * @err: error code, passed on to errseq_set()
+ *
+ * The value returned by errseq_set() is handed to the
+ * filemap_set_wb_err tracepoint.
+ */
+void __filemap_set_wb_err(struct address_space *mapping, int err)
+{
+	errseq_t eseq = errseq_set(&mapping->wb_err, err);
+
+	trace_filemap_set_wb_err(mapping, eseq);
+}
+EXPORT_SYMBOL(__filemap_set_wb_err);
+
+/**
+ * file_check_and_advance_wb_err - report wb error (if any) that was previously
+ * 				   recorded, and advance the file's f_wb_err
+ * 				   cursor to the current one
+ * @file: struct file on which the error is being reported
+ *
+ * When userland calls fsync (or something like nfsd does the equivalent), we
+ * want to report any writeback errors that occurred since the last fsync (or
+ * since the file was opened if there haven't been any).
+ *
+ * Grab the wb_err from the mapping. If it matches what we have in the file,
+ * then just quickly return 0. The file is all caught up.
+ *
+ * If it doesn't match, then take the mapping value, set the "seen" flag in
+ * it and try to swap it into place. If it works, or another task beat us
+ * to it with the new value, then update the f_wb_err and return the error
+ * portion. The error at this point must be reported via proper channels
+ * (a'la fsync, or NFS COMMIT operation, etc.).
+ *
+ * While we handle mapping->wb_err with atomic operations, the f_wb_err
+ * value is protected by the f_lock since we must ensure that it reflects
+ * the latest value swapped in for this file descriptor.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int file_check_and_advance_wb_err(struct file *file)
+{
+	int err = 0;
+	errseq_t old = READ_ONCE(file->f_wb_err);
+	struct address_space *mapping = file->f_mapping;
+
+	/* Locklessly handle the common case where nothing has changed */
+	if (errseq_check(&mapping->wb_err, old)) {
+		/* Something changed, must use slow path */
+		spin_lock(&file->f_lock);
+		old = file->f_wb_err;	/* re-read under f_lock, for the tracepoint */
+		err = errseq_check_and_advance(&mapping->wb_err,
+						&file->f_wb_err);
+		trace_file_check_and_advance_wb_err(file, old);
+		spin_unlock(&file->f_lock);
+	}
+
+	/*
+	 * We're mostly using this function as a drop in replacement for
+	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
+	 * that the legacy code would have had on these flags.  This is done
+	 * unconditionally, whether or not an error is being returned.
+	 */
+	clear_bit(AS_EIO, &mapping->flags);
+	clear_bit(AS_ENOSPC, &mapping->flags);
+	return err;
+}
+EXPORT_SYMBOL(file_check_and_advance_wb_err);
+
+/**
+ * file_write_and_wait_range - write out & wait on a file range
+ * @file:	file pointing to address_space with pages
+ * @lstart:	offset in bytes where the range starts
+ * @lend:	offset in bytes where the range ends (inclusive)
+ *
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Note that @lend is inclusive (describes the last byte to be written) so
+ * that this function can be used to write to the very end-of-file (end = -1).
+ *
+ * After writing out and waiting on the data, we check and advance the
+ * f_wb_err cursor to the latest value, and return any errors detected there.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
+{
+	int err = 0, err2;
+	struct address_space *mapping = file->f_mapping;
+
+	if (mapping_needs_writeback(mapping)) {
+		err = __filemap_fdatawrite_range(mapping, lstart, lend,
+						 WB_SYNC_ALL);
+		/* See comment of filemap_write_and_wait_range() */
+		if (err != -EIO)
+			__filemap_fdatawait_range(mapping, lstart, lend);
+	}
+	err2 = file_check_and_advance_wb_err(file);
+	if (!err)
+		err = err2;
+	return err;
+}
+EXPORT_SYMBOL(file_write_and_wait_range);
+
+/**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old:	page to be replaced
+ * @new:	page to replace with
+ *
+ * This function replaces a page in the pagecache with a new one.  On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page.  Both the old and new pages must be
+ * locked.  This function does not add the new page to the LRU, the
+ * caller must do that.
+ *
+ * The remove + add is atomic.  This function cannot fail.
+ */
+void replace_page_cache_page(struct page *old, struct page *new)
+{
+	struct folio *fold = page_folio(old);
+	struct folio *fnew = page_folio(new);
+	struct address_space *mapping = old->mapping;
+	void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
+	pgoff_t offset = old->index;
+	XA_STATE(xas, &mapping->i_pages, offset);
+
+	VM_BUG_ON_PAGE(!PageLocked(old), old);
+	VM_BUG_ON_PAGE(!PageLocked(new), new);
+	VM_BUG_ON_PAGE(new->mapping, new);
+
+	get_page(new);
+	new->mapping = mapping;
+	new->index = offset;
+
+	mem_cgroup_migrate(fold, fnew);
+
+	xas_lock_irq(&xas);
+	xas_store(&xas, new);
+
+	old->mapping = NULL;
+	/* hugetlb pages do not participate in page cache accounting. */
+	if (!PageHuge(old))
+		__dec_lruvec_page_state(old, NR_FILE_PAGES);
+	if (!PageHuge(new))
+		__inc_lruvec_page_state(new, NR_FILE_PAGES);
+	if (PageSwapBacked(old))
+		__dec_lruvec_page_state(old, NR_SHMEM);
+	if (PageSwapBacked(new))
+		__inc_lruvec_page_state(new, NR_SHMEM);
+	xas_unlock_irq(&xas);
+	if (free_folio)
+		free_folio(fold);
+	folio_put(fold);
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+/*
+ * __filemap_add_folio - insert a locked folio into @mapping at @index
+ * @mapping: address space to add the folio to
+ * @folio: folio to add; must be locked and must not be swap-backed
+ * @index: page index; for a non-hugetlb folio it must be a multiple of
+ *	   the folio's page count
+ * @gfp: flags for the memcg charge; masked with GFP_RECLAIM_MASK before
+ *	 being used for the xarray allocations
+ * @shadowp: if non-NULL, receives the value entry found in the range,
+ *	     if there was one
+ *
+ * A non-hugetlb folio is charged to the memcg first.  The folio then gains
+ * one reference per page (a single one for hugetlb) and has ->mapping and
+ * ->index set before the store is attempted.  On failure the charge is
+ * undone, ->mapping is cleared and the references are put again.
+ *
+ * Return: 0 on success, -EEXIST if a non-value entry already occupies the
+ * range, or another negative error from the charge or the xarray.
+ */
+noinline int __filemap_add_folio(struct address_space *mapping,
+		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
+{
+	XA_STATE(xas, &mapping->i_pages, index);
+	int huge = folio_test_hugetlb(folio);
+	bool charged = false;
+	long nr = 1;
+
+	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+	VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
+	mapping_set_update(&xas, mapping);
+
+	if (!huge) {
+		int error = mem_cgroup_charge(folio, NULL, gfp);
+		VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
+		if (error)
+			return error;
+		charged = true;
+		xas_set_order(&xas, index, folio_order(folio));
+		nr = folio_nr_pages(folio);
+	}
+
+	gfp &= GFP_RECLAIM_MASK;
+	folio_ref_add(folio, nr);
+	folio->mapping = mapping;
+	folio->index = xas.xa_index;
+
+	do {
+		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+		void *entry, *old = NULL;
+
+		if (order > folio_order(folio))
+			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+					order, gfp);
+		xas_lock_irq(&xas);
+		xas_for_each_conflict(&xas, entry) {
+			old = entry;
+			if (!xa_is_value(entry)) {
+				xas_set_err(&xas, -EEXIST);
+				goto unlock;
+			}
+		}
+
+		if (old) {
+			if (shadowp)
+				*shadowp = old;
+			/* entry may have been split before we acquired lock */
+			order = xa_get_order(xas.xa, xas.xa_index);
+			if (order > folio_order(folio)) {
+				/* How to handle large swap entries? */
+				BUG_ON(shmem_mapping(mapping));
+				xas_split(&xas, old, order);
+				xas_reset(&xas);
+			}
+		}
+
+		xas_store(&xas, folio);
+		if (xas_error(&xas))
+			goto unlock;
+
+		mapping->nrpages += nr;
+
+		/* hugetlb pages do not participate in page cache accounting */
+		if (!huge) {
+			__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+			if (folio_test_pmd_mappable(folio))
+				__lruvec_stat_mod_folio(folio,
+						NR_FILE_THPS, nr);
+		}
+unlock:
+		xas_unlock_irq(&xas);
+	} while (xas_nomem(&xas, gfp));
+
+	if (xas_error(&xas))
+		goto error;
+
+	trace_mm_filemap_add_to_page_cache(folio);
+	return 0;
+error:
+	if (charged)
+		mem_cgroup_uncharge(folio);
+	folio->mapping = NULL;
+	/* Leave page->index set: truncation relies upon it */
+	folio_put_refs(folio, nr);
+	return xas_error(&xas);
+}
+ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
+/*
+ * filemap_add_folio - lock a folio and add it to the page cache and the LRU
+ * @mapping: address space to add the folio to
+ * @folio: folio to add
+ * @index: page index at which to add it
+ * @gfp: flags passed to __filemap_add_folio()
+ *
+ * The lock bit is set with __folio_set_locked() before the insertion.  On
+ * success the folio is left locked and is added to the LRU; on failure the
+ * lock bit is cleared again.
+ *
+ * Return: 0 on success, or the error returned by __filemap_add_folio().
+ */
+int filemap_add_folio(struct address_space *mapping, struct folio *folio,
+				pgoff_t index, gfp_t gfp)
+{
+	void *shadow = NULL;
+	int ret;
+
+	__folio_set_locked(folio);
+	ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
+	if (unlikely(ret))
+		__folio_clear_locked(folio);
+	else {
+		/*
+		 * The folio might have been evicted from cache only
+		 * recently, in which case it should be activated like
+		 * any other repeatedly accessed folio.
+		 * The exception is folios getting rewritten; evicting other
+		 * data from the working set, only to cache data that will
+		 * get overwritten with something else, is a waste of memory.
+		 */
+		WARN_ON_ONCE(folio_test_active(folio));
+		if (!(gfp & __GFP_WRITE) && shadow)
+			workingset_refault(folio, shadow);
+		folio_add_lru(folio);
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(filemap_add_folio);
+/*
+ * filemap_alloc_folio (built only with CONFIG_NUMA) - allocate a folio
+ * @gfp: allocation flags
+ * @order: order of the folio
+ *
+ * When cpuset_do_page_mem_spread() is true the folio is allocated on the
+ * node returned by cpuset_mem_spread_node(); a failed attempt is repeated
+ * for as long as read_mems_allowed_retry() reports that the cookie taken
+ * before the attempt needs a retry.  Otherwise folio_alloc() is used.
+ *
+ * Return: the folio, or NULL if the allocation failed.
+ */
+#ifdef CONFIG_NUMA
+struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
+{
+	int n;
+	struct folio *folio;
+
+	if (cpuset_do_page_mem_spread()) {
+		unsigned int cpuset_mems_cookie;
+		do {
+			cpuset_mems_cookie = read_mems_allowed_begin();
+			n = cpuset_mem_spread_node();
+			folio = __folio_alloc_node(gfp, order, n);
+		} while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));
+
+		return folio;
+	}
+	return folio_alloc(gfp, order);
+}
+EXPORT_SYMBOL(filemap_alloc_folio);
+#endif
+
+/*
+ * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
+ *
+ * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
+ * The mapping with the lower address is always locked first, so the lock
+ * order does not depend on the order of the arguments.  If both arguments
+ * are the same mapping it is locked only once.
+ *
+ * @mapping1: the first mapping to lock
+ * @mapping2: the second mapping to lock
+ */
+void filemap_invalidate_lock_two(struct address_space *mapping1,
+				 struct address_space *mapping2)
+{
+	if (mapping1 > mapping2)
+		swap(mapping1, mapping2);
+	if (mapping1)
+		down_write(&mapping1->invalidate_lock);
+	if (mapping2 && mapping1 != mapping2)
+		down_write_nested(&mapping2->invalidate_lock, 1);
+}
+EXPORT_SYMBOL(filemap_invalidate_lock_two);
+
+/*
+ * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
+ *
+ * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
+ * If both arguments are the same mapping it is unlocked only once.
+ *
+ * @mapping1: the first mapping to unlock
+ * @mapping2: the second mapping to unlock
+ */
+void filemap_invalidate_unlock_two(struct address_space *mapping1,
+				   struct address_space *mapping2)
+{
+	if (mapping1)
+		up_write(&mapping1->invalidate_lock);
+	if (mapping2 && mapping1 != mapping2)
+		up_write(&mapping2->invalidate_lock);
+}
+EXPORT_SYMBOL(filemap_invalidate_unlock_two);
+
+/*
+ * In order to wait for pages to become available there must be
+ * waitqueues associated with pages.
By using a hash table of
+ * waitqueues where the bucket discipline is to maintain all
+ * waiters on the same queue and wake all when any of the pages
+ * become available, and for the woken contexts to check to be
+ * sure the appropriate page became available, this saves space
+ * at a cost of "thundering herd" phenomena during rare hash
+ * collisions.
+ */
+#define PAGE_WAIT_TABLE_BITS 8	/* the table has 1 << 8 = 256 wait queues */
+#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
+static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+/*
+ * Return the wait queue shared by every folio whose pointer hashes to the
+ * same PAGE_WAIT_TABLE_BITS-bit value as @folio.
+ */
+static wait_queue_head_t *folio_waitqueue(struct folio *folio)
+{
+	return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
+}
+/*
+ * pagecache_init - initialise every wait queue in folio_wait_table, then
+ * call page_writeback_init().
+ */
+void __init pagecache_init(void)
+{
+	int i;
+
+	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
+		init_waitqueue_head(&folio_wait_table[i]);
+
+	page_writeback_init();
+}
+
+/*
+ * The page wait code treats the "wait->flags" somewhat unusually, because
+ * we have multiple different kinds of waits, not just the usual "exclusive"
+ * one.
+ *
+ * We have:
+ *
+ *  (a) no special bits set:
+ *
+ *	We're just waiting for the bit to be released, and when a waker
+ *	calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
+ *	and remove it from the wait queue.
+ *
+ *	Simple and straightforward.
+ *
+ *  (b) WQ_FLAG_EXCLUSIVE:
+ *
+ *	The waiter is waiting to get the lock, and only one waiter should
+ *	be woken up to avoid any thundering herd behavior. We'll set the
+ *	WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
+ *
+ *	This is the traditional exclusive wait.
+ *
+ *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
+ *
+ *	The waiter is waiting to get the bit, and additionally wants the
+ *	lock to be transferred to it for fair lock behavior. If the lock
+ *	cannot be taken, we stop walking the wait queue without waking
+ *	the waiter.
*
 *	This is the "fair lock handoff" case, and in addition to setting
 *	WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
 *	that it now has the lock.
 *
 * wake_page_function() is the wake callback that folio_wait_bit_common()
 * installs in wait->func.  It returns 0 when the entry does not match the
 * key, -1 when an exclusive waiter cannot be given the bit (the entry stays
 * queued and is not woken), and otherwise 1 for an exclusive waiter and 0
 * for a non-exclusive one.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
	unsigned int flags;
	struct wait_page_key *key = arg;
	struct wait_page_queue *wait_page
		= container_of(wait, struct wait_page_queue, wait);

	/* Entries queued for a different folio/bit are skipped. */
	if (!wake_page_match(wait_page, key))
		return 0;

	/*
	 * If it's a lock handoff wait, we get the bit for it, and
	 * stop walking (and do not wake it up) if we can't.
	 */
	flags = wait->flags;
	if (flags & WQ_FLAG_EXCLUSIVE) {
		/* Bit still held: leave this exclusive waiter asleep. */
		if (test_bit(key->bit_nr, &key->folio->flags))
			return -1;
		if (flags & WQ_FLAG_CUSTOM) {
			/* Handoff: take the bit on the waiter's behalf. */
			if (test_and_set_bit(key->bit_nr, &key->folio->flags))
				return -1;
			flags |= WQ_FLAG_DONE;
		}
	}

	/*
	 * We are holding the wait-queue lock, but the waiter that
	 * is waiting for this will be checking the flags without
	 * any locking.
	 *
	 * So update the flags atomically, and wake up the waiter
	 * afterwards to avoid any races. This store-release pairs
	 * with the load-acquire in folio_wait_bit_common().
	 */
	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
	wake_up_state(wait->private, mode);

	/*
	 * Ok, we have successfully done what we're waiting for,
	 * and we can unconditionally remove the wait entry.
	 *
	 * Note that this pairs with the "finish_wait()" in the
	 * waiter, and has to be the absolute last thing we do.
	 * After this list_del_init(&wait->entry) the wait entry
	 * might be de-allocated and the process might even have
	 * exited.
	 */
	list_del_init_careful(&wait->entry);
	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

/*
 * Wake the waiters queued on @folio's hashed wait queue for bit @bit_nr.
 *
 * The queue is walked under q->lock with __wake_up_locked_key_bookmark().
 * While the bookmark entry still carries WQ_FLAG_BOOKMARK the lock is
 * dropped and retaken before the walk is continued.  Afterwards the folio's
 * waiters flag is cleared if the queue is empty or key.page_match is zero
 * (presumably: no queued entry referred to this folio; page_match is set
 * outside this function - confirm against wake_page_match()).
 */
static void folio_wake_bit(struct folio *folio, int bit_nr)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	struct wait_page_key key;
	unsigned long flags;
	wait_queue_entry_t bookmark;

	key.folio = folio;
	key.bit_nr = bit_nr;
	key.page_match = 0;

	/* An unqueued, zeroed entry used only to remember the walk position. */
	bookmark.flags = 0;
	bookmark.private = NULL;
	bookmark.func = NULL;
	INIT_LIST_HEAD(&bookmark.entry);

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);

	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
		/*
		 * Take a breather from holding the lock,
		 * allow pages that finish wake up asynchronously
		 * to acquire the lock and remove themselves
		 * from wait queue
		 */
		spin_unlock_irqrestore(&q->lock, flags);
		cpu_relax();
		spin_lock_irqsave(&q->lock, flags);
		__wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark);
	}

	/*
	 * It's possible to miss clearing waiters here, when we woke our page
	 * waiters, but the hashed waitqueue has waiters for other pages on it.
	 * That's okay, it's a rare case. The next waker will clear it.
	 *
	 * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
	 * other), the flag may be cleared in the course of freeing the page;
	 * but that is not required for correctness.
	 */
	if (!waitqueue_active(q) || !key.page_match)
		folio_clear_waiters(folio);

	spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * Wake waiters for @bit on @folio, skipping the queue walk entirely when
 * the folio's waiters flag is not set.
 */
static void folio_wake(struct folio *folio, int bit)
{
	if (!folio_test_waiters(folio))
		return;
	folio_wake_bit(folio, bit);
}

/*
 * A choice of three behaviors for folio_wait_bit_common():
 */
enum behavior {
	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
			 * __folio_lock() waiting on then setting PG_locked.
			 */
	SHARED,		/* Hold ref to page and check the bit when woken, like
			 * folio_wait_writeback() waiting on PG_writeback.
			 */
	DROP,		/* Drop ref to page before wait, no check when woken,
			 * like folio_put_wait_locked() on PG_locked.
			 */
};

/*
 * Attempt to check (or get) the folio flag, and mark us done
 * if successful.
 *
 * An exclusive waiter (WQ_FLAG_EXCLUSIVE) atomically sets the bit and fails
 * if it was already set; any other waiter only tests that the bit is clear.
 * On success both WQ_FLAG_WOKEN and WQ_FLAG_DONE are set in wait->flags and
 * true is returned; on failure wait->flags is left untouched.
 */
static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
					struct wait_queue_entry *wait)
{
	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
		if (test_and_set_bit(bit_nr, &folio->flags))
			return false;
	} else if (test_bit(bit_nr, &folio->flags))
		return false;

	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
	return true;
}

/* How many times do we accept lock stealing from under a waiter? */
int sysctl_page_lock_unfairness = 5;

/*
 * Common slow path for waiting on (and, for EXCLUSIVE, taking) a folio bit.
 *
 * @folio:    folio whose flag bit is waited on
 * @bit_nr:   the flag bit number
 * @state:    task state to sleep in; also decides which signals end the wait
 * @behavior: EXCLUSIVE, SHARED or DROP, see enum behavior
 *
 * Returns 0 on success and -EINTR if the wait ended without the required
 * wake-up: an EXCLUSIVE waiter needs WQ_FLAG_DONE, the others WQ_FLAG_WOKEN.
 *
 * For PG_locked waits on a folio that is not uptodate but is in the
 * workingset, the wait is bracketed by delayacct thrashing and PSI memstall
 * accounting.
 */
static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
		int state, enum behavior behavior)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	int unfairness = sysctl_page_lock_unfairness;
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;

	if (bit_nr == PG_locked &&
	    !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = bit_nr;

repeat:
	wait->flags = 0;
	if (behavior == EXCLUSIVE) {
		wait->flags = WQ_FLAG_EXCLUSIVE;
		/* Unfairness budget used up: request a fair lock handoff. */
		if (--unfairness < 0)
			wait->flags |= WQ_FLAG_CUSTOM;
	}

	/*
	 * Do one last check whether we can get the
	 * page bit synchronously.
	 *
	 * Do the folio_set_waiters() marking before that
	 * to let any waker we _just_ missed know they
	 * need to wake us up (otherwise they'll never
	 * even go to the slow case that looks at the
	 * page queue), and add ourselves to the wait
	 * queue if we need to sleep.
	 *
	 * This part needs to be done under the queue
	 * lock to avoid races.
	 */
	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, bit_nr, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * From now on, all the logic will be based on
	 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
	 * see whether the page bit testing has already
	 * been done by the wake function.
	 *
	 * We can drop our reference to the folio.
	 */
	if (behavior == DROP)
		folio_put(folio);

	/*
	 * Note that until the "finish_wait()", or until
	 * we see the WQ_FLAG_WOKEN flag, we need to
	 * be very careful with the 'wait->flags', because
	 * we may race with a waker that sets them.
	 */
	for (;;) {
		unsigned int flags;

		set_current_state(state);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(state, current))
				break;

			io_schedule();
			continue;
		}

		/* If we were non-exclusive, we're done */
		if (behavior != EXCLUSIVE)
			break;

		/* If the waker got the lock for us, we're done */
		if (flags & WQ_FLAG_DONE)
			break;

		/*
		 * Otherwise, if we're getting the lock, we need to
		 * try to get it ourselves.
		 *
		 * And if that fails, we'll have to retry this all.
		 */
		if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
			goto repeat;

		wait->flags |= WQ_FLAG_DONE;
		break;
	}

	/*
	 * If a signal happened, this 'finish_wait()' may remove the last
	 * waiter from the wait-queues, but the folio waiters bit will remain
	 * set. That's ok. The next wakeup will take care of it, and trying
	 * to do it here would be difficult and prone to races.
	 */
	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}

	/*
	 * NOTE! The wait->flags weren't stable until we've done the
	 * 'finish_wait()', and we could have exited the loop above due
	 * to a signal, and had a wakeup event happen after the signal
	 * test but before the 'finish_wait()'.
	 *
	 * So only after the finish_wait() can we reliably determine
	 * if we got woken up or not, so we can now figure out the final
	 * return value based on that state without races.
	 *
	 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
	 * waiter, but an exclusive one requires WQ_FLAG_DONE.
	 */
	if (behavior == EXCLUSIVE)
		return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}

#ifdef CONFIG_MIGRATION
/**
 * migration_entry_wait_on_locked - Wait for a migration entry to be removed
 * @entry: migration swap entry.
 * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required
 *        for pte entries, pass NULL for pmd entries.
 * @ptl: already locked ptl. This function will drop the lock.
 *
 * Wait for a migration entry referencing the given page to be removed. This is
 * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
 * this can be called without taking a reference on the page. Instead this
 * should be called while holding the ptl for the migration entry referencing
 * the page.
 *
 * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock().
 *
 * This follows the same logic as folio_wait_bit_common() so see the comments
 * there.
*/
void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
				spinlock_t *ptl)
{
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;
	wait_queue_head_t *q;
	struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));

	q = folio_waitqueue(folio);
	/* Same thrashing/PSI accounting as the PG_locked case elsewhere. */
	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	/* A non-exclusive wait (flags == 0) on PG_locked. */
	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = PG_locked;
	wait->flags = 0;

	/* Queue ourselves before the ptl is dropped below. */
	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, PG_locked, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * If a migration entry exists for the page the migration path must hold
	 * a valid reference to the page, and it must take the ptl to remove the
	 * migration entry. So the page is valid until the ptl is dropped.
	 */
	if (ptep)
		pte_unmap_unlock(ptep, ptl);
	else
		spin_unlock(ptl);

	for (;;) {
		unsigned int flags;

		set_current_state(TASK_UNINTERRUPTIBLE);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			/*
			 * NOTE(review): with TASK_UNINTERRUPTIBLE this check
			 * presumably never fires - confirm against
			 * signal_pending_state().
			 */
			if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
				break;

			io_schedule();
			continue;
		}
		break;
	}

	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}
}
#endif

/*
 * folio_wait_bit - wait in TASK_UNINTERRUPTIBLE for @bit_nr of @folio
 *
 * SHARED wait via folio_wait_bit_common(); the result is discarded.
 */
void folio_wait_bit(struct folio *folio, int bit_nr)
{
	folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit);

/*
 * folio_wait_bit_killable - like folio_wait_bit() but in TASK_KILLABLE
 *
 * Returns what folio_wait_bit_common() returns: 0, or -EINTR if the wait
 * ended without a wake-up.
 */
int folio_wait_bit_killable(struct folio *folio, int bit_nr)
{
	return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit_killable);

/**
 * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
 * @folio: The folio to wait for.
 * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
 *
 * The caller should hold a reference on @folio.  They expect the page to
 * become unlocked relatively soon, but do not wish to hold up migration
 * (for example) by holding the reference while waiting for the folio to
 * come unlocked.  After this function returns, the caller should not
 * dereference @folio.
 *
 * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
 */
static int folio_put_wait_locked(struct folio *folio, int state)
{
	return folio_wait_bit_common(folio, PG_locked, state, DROP);
}

/**
 * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
 * @folio: Folio defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Add an arbitrary @waiter to the wait queue for the nominated @folio.
*/
void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	unsigned long flags;

	/* Queue the entry and set the waiters flag under the queue lock. */
	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue_entry_tail(q, waiter);
	folio_set_waiters(folio);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(folio_add_wait_queue);

#ifndef clear_bit_unlock_is_negative_byte

/*
 * PG_waiters is the high bit in the same byte as PG_locked.
 *
 * On x86 (and on many other architectures), we can clear PG_locked and
 * test the sign bit at the same time. But if the architecture does
 * not support that special operation, we just do this all by hand
 * instead.
 *
 * The read of PG_waiters has to be after (or concurrently with) PG_locked
 * being cleared, but a memory barrier should be unnecessary since it is
 * in the same byte as PG_locked.
 *
 * Generic fallback, used only when the architecture does not provide
 * clear_bit_unlock_is_negative_byte: clears bit @nr in @mem with
 * clear_bit_unlock() and returns whether PG_waiters is set in @mem.
 */
static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
{
	clear_bit_unlock(nr, mem);
	/* smp_mb__after_atomic(); */
	return test_bit(PG_waiters, mem);
}

#endif

/**
 * folio_unlock - Unlock a locked folio.
 * @folio: The folio.
 *
 * Unlocks the folio and wakes up any thread sleeping on the page lock.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_unlock(struct folio *folio)
{
	/* Bit 7 allows x86 to check the byte's sign bit */
	BUILD_BUG_ON(PG_waiters != 7);
	BUILD_BUG_ON(PG_locked > 7);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	/* Only walk the wait queue when PG_waiters was seen set. */
	if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
		folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_unlock);

/**
 * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
 * @folio: The folio.
 *
 * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
 * it.  The folio reference held for PG_private_2 being set is released.
+ * + * This is, for example, used when a netfs folio is being written to a local + * disk cache, thereby allowing writes to the cache for the same folio to be + * serialised. + */ +void folio_end_private_2(struct folio *folio) +{ + VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio); + clear_bit_unlock(PG_private_2, folio_flags(folio, 0)); + folio_wake_bit(folio, PG_private_2); + folio_put(folio); +} +EXPORT_SYMBOL(folio_end_private_2); + +/** + * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio. + * @folio: The folio to wait on. + * + * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio. + */ +void folio_wait_private_2(struct folio *folio) +{ + while (folio_test_private_2(folio)) + folio_wait_bit(folio, PG_private_2); +} +EXPORT_SYMBOL(folio_wait_private_2); + +/** + * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio. + * @folio: The folio to wait on. + * + * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a + * fatal signal is received by the calling task. + * + * Return: + * - 0 if successful. + * - -EINTR if a fatal signal was encountered. + */ +int folio_wait_private_2_killable(struct folio *folio) +{ + int ret = 0; + + while (folio_test_private_2(folio)) { + ret = folio_wait_bit_killable(folio, PG_private_2); + if (ret < 0) + break; + } + + return ret; +} +EXPORT_SYMBOL(folio_wait_private_2_killable); + +/** + * folio_end_writeback - End writeback against a folio. + * @folio: The folio. + */ +void folio_end_writeback(struct folio *folio) +{ + /* + * folio_test_clear_reclaim() could be used here but it is an + * atomic operation and overkill in this particular case. Failing + * to shuffle a folio marked for immediate reclaim is too mild + * a gain to justify taking an atomic operation penalty at the + * end of every folio writeback. 
+ */ + if (folio_test_reclaim(folio)) { + folio_clear_reclaim(folio); + folio_rotate_reclaimable(folio); + } + + /* + * Writeback does not hold a folio reference of its own, relying + * on truncation to wait for the clearing of PG_writeback. + * But here we must make sure that the folio is not freed and + * reused before the folio_wake(). + */ + folio_get(folio); + if (!__folio_end_writeback(folio)) + BUG(); + + smp_mb__after_atomic(); + folio_wake(folio, PG_writeback); + acct_reclaim_writeback(folio); + folio_put(folio); +} +EXPORT_SYMBOL(folio_end_writeback); + +/* + * After completing I/O on a page, call this routine to update the page + * flags appropriately + */ +void page_endio(struct page *page, bool is_write, int err) +{ + struct folio *folio = page_folio(page); + + if (!is_write) { + if (!err) { + folio_mark_uptodate(folio); + } else { + folio_clear_uptodate(folio); + folio_set_error(folio); + } + folio_unlock(folio); + } else { + if (err) { + struct address_space *mapping; + + folio_set_error(folio); + mapping = folio_mapping(folio); + if (mapping) + mapping_set_error(mapping, err); + } + folio_end_writeback(folio); + } +} +EXPORT_SYMBOL_GPL(page_endio); + +/** + * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it. 
+ * @folio: The folio to lock + */ +void __folio_lock(struct folio *folio) +{ + folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE, + EXCLUSIVE); +} +EXPORT_SYMBOL(__folio_lock); + +int __folio_lock_killable(struct folio *folio) +{ + return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE, + EXCLUSIVE); +} +EXPORT_SYMBOL_GPL(__folio_lock_killable); + +static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) +{ + struct wait_queue_head *q = folio_waitqueue(folio); + int ret = 0; + + wait->folio = folio; + wait->bit_nr = PG_locked; + + spin_lock_irq(&q->lock); + __add_wait_queue_entry_tail(q, &wait->wait); + folio_set_waiters(folio); + ret = !folio_trylock(folio); + /* + * If we were successful now, we know we're still on the + * waitqueue as we're still under the lock. This means it's + * safe to remove and return success, we know the callback + * isn't going to trigger. + */ + if (!ret) + __remove_wait_queue(q, &wait->wait); + else + ret = -EIOCBQUEUED; + spin_unlock_irq(&q->lock); + return ret; +} + +/* + * Return values: + * true - folio is locked; mmap_lock is still held. + * false - folio is not locked. + * mmap_lock has been released (mmap_read_unlock(), unless flags had both + * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in + * which case mmap_lock is still held. + * + * If neither ALLOW_RETRY nor KILLABLE are set, will always return true + * with the folio locked and the mmap_lock unperturbed. + */ +bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, + unsigned int flags) +{ + if (fault_flag_allow_retry_first(flags)) { + /* + * CAUTION! In this case, mmap_lock is not released + * even though return 0. 
+ */ + if (flags & FAULT_FLAG_RETRY_NOWAIT) + return false; + + mmap_read_unlock(mm); + if (flags & FAULT_FLAG_KILLABLE) + folio_wait_locked_killable(folio); + else + folio_wait_locked(folio); + return false; + } + if (flags & FAULT_FLAG_KILLABLE) { + bool ret; + + ret = __folio_lock_killable(folio); + if (ret) { + mmap_read_unlock(mm); + return false; + } + } else { + __folio_lock(folio); + } + + return true; +} + +/** + * page_cache_next_miss() - Find the next gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the + * gap with the lowest index. + * + * This function may be called under the rcu_read_lock. However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 5, then subsequently a gap is + * created at index 10, page_cache_next_miss covering both indices may + * return 10 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'return - index >= max_scan' will be true). + * In the rare case of index wrap-around, 0 will be returned. + */ +pgoff_t page_cache_next_miss(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + XA_STATE(xas, &mapping->i_pages, index); + + while (max_scan--) { + void *entry = xas_next(&xas); + if (!entry || xa_is_value(entry)) + break; + if (xas.xa_index == 0) + break; + } + + return xas.xa_index; +} +EXPORT_SYMBOL(page_cache_next_miss); + +/** + * page_cache_prev_miss() - Find the previous gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [max(index - max_scan + 1, 0), index] for the + * gap with the highest index. + * + * This function may be called under the rcu_read_lock. 
However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 10, then subsequently a gap is + * created at index 5, page_cache_prev_miss() covering both indices may + * return 5 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'index - return >= max_scan' will be true). + * In the rare case of wrap-around, ULONG_MAX will be returned. + */ +pgoff_t page_cache_prev_miss(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + XA_STATE(xas, &mapping->i_pages, index); + + while (max_scan--) { + void *entry = xas_prev(&xas); + if (!entry || xa_is_value(entry)) + break; + if (xas.xa_index == ULONG_MAX) + break; + } + + return xas.xa_index; +} +EXPORT_SYMBOL(page_cache_prev_miss); + +/* + * Lockless page cache protocol: + * On the lookup side: + * 1. Load the folio from i_pages + * 2. Increment the refcount if it's not zero + * 3. If the folio is not found by xas_reload(), put the refcount and retry + * + * On the removal side: + * A. Freeze the page (by zeroing the refcount if nobody else has a reference) + * B. Remove the page from i_pages + * C. Return the page to the page allocator + * + * This means that any page may have its reference count temporarily + * increased by a speculative page cache (or fast GUP) lookup as it can + * be allocated by another user before the RCU grace period expires. + * Because the refcount temporarily acquired here may end up being the + * last refcount on the page, any page allocation must be freeable by + * folio_put(). + */ + +/* + * mapping_get_entry - Get a page cache entry. + * @mapping: the address_space to search + * @index: The page cache index. + * + * Looks up the page cache entry at @mapping & @index. If it is a folio, + * it is returned with an increased refcount. 
If it is a shadow entry
 * of a previously evicted folio, or a swap entry from shmem/tmpfs,
 * it is returned without further action.
 *
 * Return: The folio, swap or shadow entry, %NULL if nothing is found.
 */
static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct folio *folio;

	rcu_read_lock();
repeat:
	/* Every attempt restarts the lookup from a reset walk state. */
	xas_reset(&xas);
	folio = xas_load(&xas);
	if (xas_retry(&xas, folio))
		goto repeat;
	/*
	 * A shadow entry of a recently evicted page, or a swap entry from
	 * shmem/tmpfs.  Return it without attempting to raise page count.
	 */
	if (!folio || xa_is_value(folio))
		goto out;

	/* Could not take a reference: look the entry up again. */
	if (!folio_try_get_rcu(folio))
		goto repeat;

	/* The slot changed under us: drop our reference and retry. */
	if (unlikely(folio != xas_reload(&xas))) {
		folio_put(folio);
		goto repeat;
	}
out:
	rcu_read_unlock();

	return folio;
}

/**
 * __filemap_get_folio - Find and get a reference to a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 * @fgp_flags: %FGP flags modify how the folio is returned.
 * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
 *
 * Looks up the page cache entry at @mapping & @index.
 *
 * @fgp_flags can be zero or more of these flags:
 *
 * * %FGP_ACCESSED - The folio will be marked accessed.
 * * %FGP_LOCK - The folio is returned locked.
 * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
 *   instead of allocating a new folio to replace it.
 * * %FGP_CREAT - If no page is present then a new page is allocated using
 *   @gfp and added to the page cache and the VM's LRU list.
 *   The page is returned locked and with an increased refcount.
 * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
 *   page is already in cache.  If the page was allocated, unlock it before
 *   returning so the caller can do the same dance.
 * * %FGP_WRITE - The page will be written to by the caller.
 * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
* * %FGP_NOWAIT - Don't get blocked by page lock.
 * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
 *
 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
 * if the %GFP flags specified for %FGP_CREAT are atomic.
 *
 * If there is a page cache page, it is returned with an increased refcount.
 *
 * Return: The found folio or %NULL otherwise.
 */
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
		int fgp_flags, gfp_t gfp)
{
	struct folio *folio;

repeat:
	folio = mapping_get_entry(mapping, index);
	/* A value entry is handed back only for FGP_ENTRY callers. */
	if (xa_is_value(folio)) {
		if (fgp_flags & FGP_ENTRY)
			return folio;
		folio = NULL;
	}
	if (!folio)
		goto no_page;

	if (fgp_flags & FGP_LOCK) {
		if (fgp_flags & FGP_NOWAIT) {
			/* Never sleep on the lock: give up instead. */
			if (!folio_trylock(folio)) {
				folio_put(folio);
				return NULL;
			}
		} else {
			folio_lock(folio);
		}

		/* Has the page been truncated? */
		if (unlikely(folio->mapping != mapping)) {
			folio_unlock(folio);
			folio_put(folio);
			goto repeat;
		}
		VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
	}

	if (fgp_flags & FGP_ACCESSED)
		folio_mark_accessed(folio);
	else if (fgp_flags & FGP_WRITE) {
		/* Clear idle flag for buffer write */
		if (folio_test_idle(folio))
			folio_clear_idle(folio);
	}

	if (fgp_flags & FGP_STABLE)
		folio_wait_stable(folio);
no_page:
	if (!folio && (fgp_flags & FGP_CREAT)) {
		int err;
		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
			gfp |= __GFP_WRITE;
		if (fgp_flags & FGP_NOFS)
			gfp &= ~__GFP_FS;
		if (fgp_flags & FGP_NOWAIT) {
			gfp &= ~GFP_KERNEL;
			gfp |= GFP_NOWAIT | __GFP_NOWARN;
		}

		/* Order-0 allocation only. */
		folio = filemap_alloc_folio(gfp, 0);
		if (!folio)
			return NULL;

		/* FGP_CREAT without FGP_LOCK/FGP_FOR_MMAP: warn, force FGP_LOCK. */
		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
			fgp_flags |= FGP_LOCK;

		/* Init accessed so avoid atomic mark_page_accessed later */
		if (fgp_flags & FGP_ACCESSED)
			__folio_set_referenced(folio);

		err = filemap_add_folio(mapping, folio, index, gfp);
		if (unlikely(err)) {
			folio_put(folio);
			folio = NULL;
			/* -EEXIST: redo the lookup; other errors return NULL. */
			if (err == -EEXIST)
				goto repeat;
		}

		/*
		 * filemap_add_folio locks the page, and for mmap
		 * we expect an unlocked page.
		 */
		if (folio && (fgp_flags & FGP_FOR_MMAP))
			folio_unlock(folio);
	}

	return folio;
}
EXPORT_SYMBOL(__filemap_get_folio);

/*
 * Advance @xas to the next entry up to index @max and return it.
 *
 * @mark selects the search: XA_PRESENT uses xas_find(), any other mark uses
 * xas_find_marked().  A folio is returned with a reference taken; NULL and
 * value entries are returned as found.  If the reference cannot be taken,
 * or the slot no longer holds the same folio, the walk state is reset and
 * the search is repeated.
 */
static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
		xa_mark_t mark)
{
	struct folio *folio;

retry:
	if (mark == XA_PRESENT)
		folio = xas_find(xas, max);
	else
		folio = xas_find_marked(xas, max, mark);

	if (xas_retry(xas, folio))
		goto retry;
	/*
	 * A shadow entry of a recently evicted page, a swap
	 * entry from shmem/tmpfs or a DAX entry.  Return it
	 * without attempting to raise page count.
	 */
	if (!folio || xa_is_value(folio))
		return folio;

	if (!folio_try_get_rcu(folio))
		goto reset;

	if (unlikely(folio != xas_reload(xas))) {
		folio_put(folio);
		goto reset;
	}

	return folio;
reset:
	xas_reset(xas);
	goto retry;
}

/**
 * find_get_entries - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page cache index
 * @end:	The final page index (inclusive).
 * @fbatch:	Where the resulting entries are placed.
 * @indices:	The cache indices corresponding to the entries in @fbatch
 *
 * find_get_entries() will search for and return a batch of entries in
 * the mapping.  The entries are placed in @fbatch.  find_get_entries()
 * takes a reference on any actual folios it returns.
 *
 * The entries have ascending indexes.  The indices may not be consecutive
 * due to not-present entries or large folios.
 *
 * Any shadow entries of evicted folios, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * Return: The number of entries which were found.
+ */ +unsigned find_get_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) +{ + XA_STATE(xas, &mapping->i_pages, start); + struct folio *folio; + + rcu_read_lock(); + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { + indices[fbatch->nr] = xas.xa_index; + if (!folio_batch_add(fbatch, folio)) + break; + } + rcu_read_unlock(); + + return folio_batch_count(fbatch); +} + +/** + * find_lock_entries - Find a batch of pagecache entries. + * @mapping: The address_space to search. + * @start: The starting page cache index. + * @end: The final page index (inclusive). + * @fbatch: Where the resulting entries are placed. + * @indices: The cache indices of the entries in @fbatch. + * + * find_lock_entries() will return a batch of entries from @mapping. + * Swap, shadow and DAX entries are included. Folios are returned + * locked and with an incremented refcount. Folios which are locked + * by somebody else or under writeback are skipped. Folios which are + * partially outside the range are not returned. + * + * The entries have ascending indexes. The indices may not be consecutive + * due to not-present entries, large folios, folios which could not be + * locked or folios under writeback. + * + * Return: The number of entries which were found. 
*/
unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
	XA_STATE(xas, &mapping->i_pages, start);
	struct folio *folio;

	rcu_read_lock();
	while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
		/* Value entries are batched as-is; only folios are vetted. */
		if (!xa_is_value(folio)) {
			/* Skip folios that start before @start ... */
			if (folio->index < start)
				goto put;
			/* ... or whose last page lies beyond @end. */
			if (folio->index + folio_nr_pages(folio) - 1 > end)
				goto put;
			/* Skip folios somebody else holds locked. */
			if (!folio_trylock(folio))
				goto put;
			/* Skip folios no longer in @mapping or under writeback. */
			if (folio->mapping != mapping ||
			    folio_test_writeback(folio))
				goto unlock;
			VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
					folio);
		}
		indices[fbatch->nr] = xas.xa_index;
		if (!folio_batch_add(fbatch, folio))
			break;
		continue;
unlock:
		folio_unlock(folio);
put:
		/* Drop the reference find_get_entry() took on a skipped folio. */
		folio_put(folio);
	}
	rcu_read_unlock();

	return folio_batch_count(fbatch);
}

/**
 * filemap_get_folios - Get a batch of folios
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @end:	The final page index (inclusive)
 * @fbatch:	The batch to fill.
 *
 * Search for and return a batch of folios in the mapping starting at
 * index @start and up to index @end (inclusive).  The folios are returned
 * in @fbatch with an elevated reference count.
 *
 * The first folio may start before @start; if it does, it will contain
 * @start.  The final folio may extend beyond @end; if it does, it will
 * contain @end.  The folios have ascending indices.  There may be gaps
 * between the folios if there are indices which have no folio in the
 * page cache.  If folios are added to or removed from the page cache
 * while this is running, they may or may not be found by this call.
 *
 * Return: The number of folios which were found.
 * We also update @start to index the next folio for the traversal.
+ */ +unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, + pgoff_t end, struct folio_batch *fbatch) +{ + XA_STATE(xas, &mapping->i_pages, *start); + struct folio *folio; + + rcu_read_lock(); + while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { + /* Skip over shadow, swap and DAX entries */ + if (xa_is_value(folio)) + continue; + if (!folio_batch_add(fbatch, folio)) { + unsigned long nr = folio_nr_pages(folio); + + if (folio_test_hugetlb(folio)) + nr = 1; + *start = folio->index + nr; + goto out; + } + } + + /* + * We come here when there is no page beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. This + * breaks the iteration when there is a page at index -1 but that is + * already broken anyway. + */ + if (end == (pgoff_t)-1) + *start = (pgoff_t)-1; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return folio_batch_count(fbatch); +} +EXPORT_SYMBOL(filemap_get_folios); + +static inline +bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) +{ + if (!folio_test_large(folio) || folio_test_hugetlb(folio)) + return false; + if (index >= max) + return false; + return index < folio->index + folio_nr_pages(folio) - 1; +} + +/** + * filemap_get_folios_contig - Get a batch of contiguous folios + * @mapping: The address_space to search + * @start: The starting page index + * @end: The final page index (inclusive) + * @fbatch: The batch to fill + * + * filemap_get_folios_contig() works exactly like filemap_get_folios(), + * except the returned folios are guaranteed to be contiguous. This may + * not return all contiguous folios if the batch gets filled up. + * + * Return: The number of folios found. + * Also update @start to be positioned for traversal of the next folio. 
+ *
+ * The walk stops at the first hole, value entry or sibling entry it meets,
+ * so the batch may end before @end even when it is not full.
+ */
+
+unsigned filemap_get_folios_contig(struct address_space *mapping,
+		pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
+{
+	XA_STATE(xas, &mapping->i_pages, *start);
+	unsigned long nr;
+	struct folio *folio;
+
+	rcu_read_lock();
+
+	for (folio = xas_load(&xas); folio && xas.xa_index <= end;
+			folio = xas_next(&xas)) {
+		if (xas_retry(&xas, folio))
+			continue;
+		/*
+		 * If the entry has been swapped out, we can stop looking.
+		 * No current caller is looking for DAX entries.
+		 */
+		if (xa_is_value(folio))
+			goto update_start;
+
+		/*
+		 * xas_next() can step into the middle of a multi-index
+		 * (large folio) entry and hand back a sibling entry.  That
+		 * is not a folio pointer, so it must never reach
+		 * folio_try_get_rcu().  End the batch here; @start is then
+		 * set past the large folio already collected, so the next
+		 * call resumes at its end.
+		 */
+		if (xa_is_sibling(folio))
+			goto update_start;
+
+		if (!folio_try_get_rcu(folio))
+			goto retry;
+
+		/* Has the entry changed under us?  Drop the ref and look again. */
+		if (unlikely(folio != xas_reload(&xas)))
+			goto put_folio;
+
+		if (!folio_batch_add(fbatch, folio)) {
+			nr = folio_nr_pages(folio);
+
+			if (folio_test_hugetlb(folio))
+				nr = 1;
+			*start = folio->index + nr;
+			goto out;
+		}
+		continue;
+put_folio:
+		folio_put(folio);
+
+retry:
+		xas_reset(&xas);
+	}
+
+update_start:
+	nr = folio_batch_count(fbatch);
+
+	/* Leave @start untouched when nothing was collected. */
+	if (nr) {
+		folio = fbatch->folios[nr - 1];
+		if (folio_test_hugetlb(folio))
+			*start = folio->index + 1;
+		else
+			*start = folio->index + folio_nr_pages(folio);
+	}
+out:
+	rcu_read_unlock();
+	return folio_batch_count(fbatch);
+}
+EXPORT_SYMBOL(filemap_get_folios_contig);
+
+/**
+ * filemap_get_folios_tag - Get a batch of folios matching @tag
+ * @mapping: The address_space to search
+ * @start: The starting page index
+ * @end: The final page index (inclusive)
+ * @tag: The tag index
+ * @fbatch: The batch to fill
+ *
+ * Same as filemap_get_folios(), but only returning folios tagged with @tag.
+ *
+ * Return: The number of folios found.
+ * Also update @start to index the next folio for traversal.
+ */
+unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
+			pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
+{
+	XA_STATE(xas, &mapping->i_pages, *start);
+	struct folio *folio;
+
+	rcu_read_lock();
+	while ((folio = find_get_entry(&xas, end, tag)) != NULL) {
+		/*
+		 * Shadow entries should never be tagged, but this iteration
+		 * is lockless so there is a window for page reclaim to evict
+		 * a page we saw tagged. Skip over it.
+		 */
+		if (xa_is_value(folio))
+			continue;
+		if (!folio_batch_add(fbatch, folio)) {	/* presumably: 0 means no room is left in the batch */
+			unsigned long nr = folio_nr_pages(folio);
+
+			if (folio_test_hugetlb(folio))
+				nr = 1;	/* hugetlb: step the index by one, not by the page count */
+			*start = folio->index + nr;	/* resume right after this folio */
+			goto out;
+		}
+	}
+	/*
+	 * We come here when there is no page beyond @end. We take care to not
+	 * overflow the index @start as it confuses some of the callers. This
+	 * breaks the iteration when there is a page at index -1 but that is
+	 * already broken anyway.
+	 */
+	if (end == (pgoff_t)-1)
+		*start = (pgoff_t)-1;
+	else
+		*start = end + 1;
+out:
+	rcu_read_unlock();
+
+	return folio_batch_count(fbatch);
+}
+EXPORT_SYMBOL(filemap_get_folios_tag);
+
+/**
+ * find_get_pages_range_tag - Find and return head pages matching @tag.
+ * @mapping: the address_space to search
+ * @index: the starting page index
+ * @end: The final page index (inclusive)
+ * @tag: the tag index
+ * @nr_pages: the maximum number of pages
+ * @pages: where the resulting pages are placed
+ *
+ * Like find_get_pages_range(), except we only return head pages which are
+ * tagged with @tag. @index is updated to the index immediately after the
+ * last page we return, ready for the next iteration.
+ *
+ * Return: the number of pages which were found.
+ */
+unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
+			pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
+			struct page **pages)
+{
+	XA_STATE(xas, &mapping->i_pages, *index);
+	struct folio *folio;
+	unsigned ret = 0;
+
+	if (unlikely(!nr_pages))
+		return 0;	/* no room in @pages: @index is left untouched */
+
+	rcu_read_lock();
+	while ((folio = find_get_entry(&xas, end, tag))) {
+		/*
+		 * Shadow entries should never be tagged, but this iteration
+		 * is lockless so there is a window for page reclaim to evict
+		 * a page we saw tagged. Skip over it.
+		 */
+		if (xa_is_value(folio))
+			continue;
+
+		pages[ret] = &folio->page;	/* the struct page embedded in the folio */
+		if (++ret == nr_pages) {	/* caller's array is full */
+			*index = folio->index + folio_nr_pages(folio);	/* next lookup resumes after this folio */
+			goto out;
+		}
+	}
+
+	/*
+	 * We come here when we got to @end. We take care to not overflow the
+	 * index @index as it confuses some of the callers. This breaks the
+	 * iteration when there is a page at index -1 but that is already
+	 * broken anyway.
+	 */
+	if (end == (pgoff_t)-1)
+		*index = (pgoff_t)-1;
+	else
+		*index = end + 1;
+out:
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(find_get_pages_range_tag);
+
+/*
+ * CD/DVDs are error prone. When a medium error occurs, the driver may fail
+ * a _large_ part of the i/o request. Imagine the worst scenario:
+ *
+ * ---R__________________________________________B__________
+ * ^ reading here ^ bad block(assume 4k)
+ *
+ * read(R) => miss => readahead(R...B) => media error => frustrating retries
+ * => failing the whole request => read(R) => read(R+1) =>
+ * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
+ * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
+ * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
+ *
+ * It is going insane. Fix it by quickly scaling down the readahead size.
+ */
+static void shrink_readahead_size_eio(struct file_ra_state *ra)
+{
+	ra->ra_pages /= 4;
+}
+
+/*
+ * filemap_get_read_batch - Get a batch of folios for read
+ *
+ * Get a batch of folios which represent a contiguous range of bytes in
+ * the file. No exceptional entries will be returned. If @index is in
+ * the middle of a folio, the entire folio will be returned. The last
+ * folio in the batch may have the readahead flag set or the uptodate flag
+ * clear so that the caller can take the appropriate action.
+ */
+static void filemap_get_read_batch(struct address_space *mapping,
+		pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
+{
+	XA_STATE(xas, &mapping->i_pages, index);
+	struct folio *folio;
+
+	rcu_read_lock();
+	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
+		if (xas_retry(&xas, folio))
+			continue;
+		if (xas.xa_index > max || xa_is_value(folio))
+			break;
+		if (xa_is_sibling(folio))
+			break;
+		if (!folio_try_get_rcu(folio))
+			goto retry;
+
+		if (unlikely(folio != xas_reload(&xas)))
+			goto put_folio;
+
+		if (!folio_batch_add(fbatch, folio))
+			break;
+		if (!folio_test_uptodate(folio))
+			break;
+		if (folio_test_readahead(folio))
+			break;
+		/* Jump to the folio's last index so xas_next() lands past it. */
+		xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1);
+		continue;
+put_folio:
+		folio_put(folio);
+retry:
+		xas_reset(&xas);
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Read one folio through @filler and wait (killably) for it to be unlocked.
+ *
+ * Returns 0 if the folio ended up uptodate, the error from @filler or from
+ * the wait, or -EIO if the read completed without making the folio uptodate.
+ * In the -EIO case the readahead window of @file (if any) is shrunk.
+ */
+static int filemap_read_folio(struct file *file, filler_t filler,
+		struct folio *folio)
+{
+	bool workingset = folio_test_workingset(folio);
+	unsigned long pflags;
+	int error;
+
+	/*
+	 * A previous I/O error may have been due to temporary failures,
+	 * eg. multipath errors. PG_error will be set again if read_folio
+	 * fails.
+	 */
+	folio_clear_error(folio);
+
+	/* Start the actual read. The read will unlock the page. */
+	if (unlikely(workingset))
+		psi_memstall_enter(&pflags);
+	error = filler(file, folio);
+	if (unlikely(workingset))
+		psi_memstall_leave(&pflags);
+	if (error)
+		return error;
+
+	error = folio_wait_locked_killable(folio);
+	if (error)
+		return error;
+	if (folio_test_uptodate(folio))
+		return 0;
+	if (file)
+		shrink_readahead_size_eio(&file->f_ra);
+	return -EIO;
+}
+
+/*
+ * Can the part of @folio that the read at @pos needs be used without I/O?
+ * True if the whole folio is uptodate; otherwise the answer comes from
+ * ->is_partially_uptodate(), which is only consulted when the op exists,
+ * the destination is not a pipe and i_blkbits is below the folio shift.
+ */
+static bool filemap_range_uptodate(struct address_space *mapping,
+		loff_t pos, struct iov_iter *iter, struct folio *folio)
+{
+	int count;
+
+	if (folio_test_uptodate(folio))
+		return true;
+	/* pipes can't handle partially uptodate pages */
+	if (iov_iter_is_pipe(iter))
+		return false;
+	if (!mapping->a_ops->is_partially_uptodate)
+		return false;
+	if (mapping->host->i_blkbits >= folio_shift(folio))
+		return false;
+
+	/* Turn (pos, count) into an offset/length relative to the folio. */
+	count = iter->count;
+	if (folio_pos(folio) > pos) {
+		count -= folio_pos(folio) - pos;
+		pos = 0;
+	} else {
+		pos -= folio_pos(folio);
+	}
+
+	return mapping->a_ops->is_partially_uptodate(folio, pos, count);
+}
+
+/*
+ * Bring the not-uptodate @folio (last in the batch) up to date for @iocb.
+ *
+ * Returns 0 on success, -EAGAIN when the iocb flags forbid waiting or I/O,
+ * AOP_TRUNCATED_PAGE when the folio must be looked up again (the folio
+ * reference has been dropped in that case), or the read error.
+ */
+static int filemap_update_page(struct kiocb *iocb,
+		struct address_space *mapping, struct iov_iter *iter,
+		struct folio *folio)
+{
+	int error;
+
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		if (!filemap_invalidate_trylock_shared(mapping))
+			return -EAGAIN;
+	} else {
+		filemap_invalidate_lock_shared(mapping);
+	}
+
+	if (!folio_trylock(folio)) {
+		error = -EAGAIN;
+		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
+			goto unlock_mapping;
+		if (!(iocb->ki_flags & IOCB_WAITQ)) {
+			filemap_invalidate_unlock_shared(mapping);
+			/*
+			 * This is where we usually end up waiting for a
+			 * previously submitted readahead to finish.
+			 */
+			folio_put_wait_locked(folio, TASK_KILLABLE);
+			return AOP_TRUNCATED_PAGE;
+		}
+		error = __folio_lock_async(folio, iocb->ki_waitq);
+		if (error)
+			goto unlock_mapping;
+	}
+
+	error = AOP_TRUNCATED_PAGE;
+	if (!folio->mapping)
+		goto unlock;
+
+	error = 0;
+	if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio))
+		goto unlock;
+
+	error = -EAGAIN;
+	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
+		goto unlock;
+
+	/* The read unlocks the folio itself, hence the jump past "unlock". */
+	error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
+			folio);
+	goto unlock_mapping;
+unlock:
+	folio_unlock(folio);
+unlock_mapping:
+	filemap_invalidate_unlock_shared(mapping);
+	if (error == AOP_TRUNCATED_PAGE)
+		folio_put(folio);
+	return error;
+}
+
+/*
+ * Allocate an order-0 folio, insert it at @index, read it in and add it to
+ * @fbatch.  Returns 0 on success, -ENOMEM, AOP_TRUNCATED_PAGE if another
+ * folio already occupies @index (-EEXIST), or the insertion/read error.
+ */
+static int filemap_create_folio(struct file *file,
+		struct address_space *mapping, pgoff_t index,
+		struct folio_batch *fbatch)
+{
+	struct folio *folio;
+	int error;
+
+	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
+	if (!folio)
+		return -ENOMEM;
+
+	/*
+	 * Protect against truncate / hole punch. Grabbing invalidate_lock
+	 * here assures we cannot instantiate and bring uptodate new
+	 * pagecache folios after evicting page cache during truncate
+	 * and before actually freeing blocks. Note that we could
+	 * release invalidate_lock after inserting the folio into
+	 * the page cache as the locked folio would then be enough to
+	 * synchronize with hole punching. But there are code paths
+	 * such as filemap_update_page() filling in partially uptodate
+	 * pages or ->readahead() that need to hold invalidate_lock
+	 * while mapping blocks for IO so let's hold the lock here as
+	 * well to keep locking rules simple.
+	 */
+	filemap_invalidate_lock_shared(mapping);
+	error = filemap_add_folio(mapping, folio, index,
+			mapping_gfp_constraint(mapping, GFP_KERNEL));
+	if (error == -EEXIST)
+		error = AOP_TRUNCATED_PAGE;
+	if (error)
+		goto error;
+
+	error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
+	if (error)
+		goto error;
+
+	filemap_invalidate_unlock_shared(mapping);
+	folio_batch_add(fbatch, folio);
+	return 0;
+error:
+	filemap_invalidate_unlock_shared(mapping);
+	folio_put(folio);
+	return error;
+}
+
+/*
+ * Kick off asynchronous readahead from @folio up to @last_index, unless the
+ * iocb forbids new I/O (IOCB_NOIO), in which case -EAGAIN is returned.
+ */
+static int filemap_readahead(struct kiocb *iocb, struct file *file,
+		struct address_space *mapping, struct folio *folio,
+		pgoff_t last_index)
+{
+	DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);
+
+	if (iocb->ki_flags & IOCB_NOIO)
+		return -EAGAIN;
+	page_cache_async_ra(&ractl, folio, last_index - folio->index);
+	return 0;
+}
+
+/*
+ * Fill @fbatch with folios covering the start of the read described by
+ * @iocb and @iter: look them up, fall back to synchronous readahead, then
+ * to creating a single folio.  Only the last folio of the batch may need
+ * readahead or updating; if that fails while earlier folios are usable,
+ * the last one is dropped from the batch and 0 is returned.
+ */
+static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
+		struct folio_batch *fbatch)
+{
+	struct file *filp = iocb->ki_filp;
+	struct address_space *mapping = filp->f_mapping;
+	struct file_ra_state *ra = &filp->f_ra;
+	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
+	pgoff_t last_index;
+	struct folio *folio;
+	int err = 0;
+
+	/* "last_index" is the index of the page beyond the end of the read */
+	last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
+retry:
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
+	filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
+	if (!folio_batch_count(fbatch)) {
+		if (iocb->ki_flags & IOCB_NOIO)
+			return -EAGAIN;
+		page_cache_sync_readahead(mapping, ra, filp, index,
+				last_index - index);
+		filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
+	}
+	if (!folio_batch_count(fbatch)) {
+		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
+			return -EAGAIN;
+		err = filemap_create_folio(filp, mapping,
+				iocb->ki_pos >> PAGE_SHIFT, fbatch);
+		if (err == AOP_TRUNCATED_PAGE)
+			goto retry;
+		return err;
+	}
+
+	folio = fbatch->folios[folio_batch_count(fbatch) - 1];
+	if (folio_test_readahead(folio)) {
+		err = filemap_readahead(iocb, filp, mapping, folio, last_index);
+		if (err)
+			goto err;
+	}
+	if (!folio_test_uptodate(folio)) {
+		if ((iocb->ki_flags & IOCB_WAITQ) &&
+		    folio_batch_count(fbatch) > 1)
+			iocb->ki_flags |= IOCB_NOWAIT;
+		err = filemap_update_page(iocb, mapping, iter, folio);
+		if (err)
+			goto err;
+	}
+
+	return 0;
+err:
+	/* For AOP_TRUNCATED_PAGE the callee already dropped the reference. */
+	if (err < 0)
+		folio_put(folio);
+	if (likely(--fbatch->nr))
+		return 0;
+	if (err == AOP_TRUNCATED_PAGE)
+		goto retry;
+	return err;
+}
+
+/* Do @pos1 and @pos2 fall into the same folio_size(@folio)-aligned window? */
+static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
+{
+	unsigned int shift = folio_shift(folio);
+
+	return (pos1 >> shift == pos2 >> shift);
+}
+
+/**
+ * filemap_read - Read data from the page cache.
+ * @iocb: The iocb to read.
+ * @iter: Destination for the data.
+ * @already_read: Number of bytes already read by the caller.
+ *
+ * Copies data from the page cache. If the data is not currently present,
+ * uses the readahead and read_folio address_space operations to fetch it.
+ *
+ * Return: Total number of bytes copied, including those already read by
+ * the caller. If an error happens before any bytes are copied, returns
+ * a negative error number.
+ */
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
+		ssize_t already_read)
+{
+	struct file *filp = iocb->ki_filp;
+	struct file_ra_state *ra = &filp->f_ra;
+	struct address_space *mapping = filp->f_mapping;
+	struct inode *inode = mapping->host;
+	struct folio_batch fbatch;
+	int i, error = 0;
+	bool writably_mapped;
+	loff_t isize, end_offset;
+
+	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
+		return 0;
+	if (unlikely(!iov_iter_count(iter)))
+		return 0;
+
+	/*
+	 * Clamp the request so that ki_pos + count cannot run past
+	 * s_maxbytes.  The limit is the room left after ki_pos, not
+	 * s_maxbytes itself; the check above guarantees the difference
+	 * is positive.
+	 */
+	iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
+	folio_batch_init(&fbatch);
+
+	do {
+		cond_resched();
+
+		/*
+		 * If we've already successfully copied some data, then we
+		 * can no longer safely return -EIOCBQUEUED. Hence mark
+		 * an async read NOWAIT at that point.
+		 */
+		if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
+			iocb->ki_flags |= IOCB_NOWAIT;
+
+		if (unlikely(iocb->ki_pos >= i_size_read(inode)))
+			break;
+
+		error = filemap_get_pages(iocb, iter, &fbatch);
+		if (error < 0)
+			break;
+
+		/*
+		 * i_size must be checked after we know the pages are Uptodate.
+		 *
+		 * Checking i_size after the check allows us to calculate
+		 * the correct value for "nr", which means the zero-filled
+		 * part of the page is not copied back to userspace (unless
+		 * another truncate extends the file - this is desired though).
+		 */
+		isize = i_size_read(inode);
+		if (unlikely(iocb->ki_pos >= isize))
+			goto put_folios;
+		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
+
+		/*
+		 * Pairs with a barrier in
+		 * block_write_end()->mark_buffer_dirty() or other page
+		 * dirtying routines like iomap_write_end() to ensure
+		 * changes to page contents are visible before we see
+		 * increased inode size.
+		 */
+		smp_rmb();
+
+		/*
+		 * Once we start copying data, we don't want to be touching any
+		 * cachelines that might be contended:
+		 */
+		writably_mapped = mapping_writably_mapped(mapping);
+
+		/*
+		 * When a read accesses the same folio several times, only
+		 * mark it as accessed the first time.
+		 */
+		if (!pos_same_folio(iocb->ki_pos, ra->prev_pos - 1,
+				    fbatch.folios[0]))
+			folio_mark_accessed(fbatch.folios[0]);
+
+		for (i = 0; i < folio_batch_count(&fbatch); i++) {
+			struct folio *folio = fbatch.folios[i];
+			size_t fsize = folio_size(folio);
+			size_t offset = iocb->ki_pos & (fsize - 1);
+			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
+					     fsize - offset);
+			size_t copied;
+
+			if (end_offset < folio_pos(folio))
+				break;
+			if (i > 0)
+				folio_mark_accessed(folio);
+			/*
+			 * If users can be writing to this folio using arbitrary
+			 * virtual addresses, take care of potential aliasing
+			 * before reading the folio on the kernel side.
+			 */
+			if (writably_mapped)
+				flush_dcache_folio(folio);
+
+			copied = copy_folio_to_iter(folio, offset, bytes, iter);
+
+			already_read += copied;
+			iocb->ki_pos += copied;
+			ra->prev_pos = iocb->ki_pos;
+
+			/* Short copy: the destination faulted part-way. */
+			if (copied < bytes) {
+				error = -EFAULT;
+				break;
+			}
+		}
+put_folios:
+		for (i = 0; i < folio_batch_count(&fbatch); i++)
+			folio_put(fbatch.folios[i]);
+		folio_batch_init(&fbatch);
+	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
+
+	file_accessed(filp);
+
+	return already_read ? already_read : error;
+}
+EXPORT_SYMBOL_GPL(filemap_read);
+
+/**
+ * generic_file_read_iter - generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the "read_iter()" routine for all filesystems
+ * that can use the page cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
+ * be returned when no data can be read without waiting for I/O requests
+ * to complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
+ * requests shall be made for the read or for readahead. When no data
+ * can be read, -EAGAIN shall be returned. When readahead would be
+ * triggered, a partial, possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t
+generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	size_t count = iov_iter_count(iter);
+	ssize_t retval = 0;
+
+	if (!count)
+		return 0; /* skip atime */
+
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		struct file *file = iocb->ki_filp;
+		struct address_space *mapping = file->f_mapping;
+		struct inode *inode = mapping->host;
+
+		if (iocb->ki_flags & IOCB_NOWAIT) {
+			if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
+						iocb->ki_pos + count - 1))
+				return -EAGAIN;	/* range needs writeback and we may not wait for it */
+		} else {
+			retval = filemap_write_and_wait_range(mapping,
+						iocb->ki_pos,
+						iocb->ki_pos + count - 1);
+			if (retval < 0)
+				return retval;
+		}
+
+		file_accessed(file);
+
+		retval = mapping->a_ops->direct_IO(iocb, iter);
+		if (retval >= 0) {
+			iocb->ki_pos += retval;
+			count -= retval;	/* bytes still wanted after the direct read */
+		}
+		if (retval != -EIOCBQUEUED)
+			iov_iter_revert(iter, count - iov_iter_count(iter));	/* leave exactly @count bytes in @iter */
+
+		/*
+		 * Btrfs can have a short DIO read if we encounter
+		 * compressed extents, so if there was an error, or if
+		 * we've already read everything we wanted to, or if
+		 * there was a short read because we hit EOF, go ahead
+		 * and return. Otherwise fallthrough to buffered io for
+		 * the rest of the read. Buffered reads will not work for
+		 * DAX files, so don't bother trying.
+		 */
+		if (retval < 0 || !count || IS_DAX(inode))
+			return retval;
+		if (iocb->ki_pos >= i_size_read(inode))
+			return retval;
+	}
+
+	return filemap_read(iocb, iter, retval);	/* @retval counts as already-read bytes */
+}
+EXPORT_SYMBOL(generic_file_read_iter);
+
+static inline loff_t folio_seek_hole_data(struct xa_state *xas,
+		struct address_space *mapping, struct folio *folio,
+		loff_t start, loff_t end, bool seek_data)
+{
+	const struct address_space_operations *ops = mapping->a_ops;
+	size_t offset, bsz = i_blocksize(mapping->host);
+
+	if (xa_is_value(folio) || folio_test_uptodate(folio))
+		return seek_data ? start : end;	/* treated as data throughout */
+	if (!ops->is_partially_uptodate)
+		return seek_data ? end : start;	/* treated as a hole throughout */
+
+	xas_pause(xas);
+	rcu_read_unlock();	/* left so that folio_lock() below can be called; re-entered before returning */
+	folio_lock(folio);
+	if (unlikely(folio->mapping != mapping))
+		goto unlock;
+
+	offset = offset_in_folio(folio, start) & ~(bsz - 1);	/* round down to a block boundary */
+
+	do {
+		if (ops->is_partially_uptodate(folio, offset, bsz) ==
+							seek_data)
+			break;
+		start = (start + bsz) & ~(bsz - 1);	/* advance to the start of the next block */
+		offset += bsz;
+	} while (offset < folio_size(folio));
+unlock:
+	folio_unlock(folio);
+	rcu_read_lock();
+	return start;
+}
+
+static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
+{
+	if (xa_is_value(folio))
+		return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);	/* size from the entry's order in the xarray */
+	return folio_size(folio);
+}
+
+/**
+ * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ * @mapping: Address space to search.
+ * @start: First byte to consider.
+ * @end: Limit of search (exclusive).
+ * @whence: Either SEEK_HOLE or SEEK_DATA.
+ *
+ * If the page cache knows which blocks contain holes and which blocks
+ * contain data, your filesystem can use this function to implement
+ * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are
+ * entirely memory-based such as tmpfs, and filesystems which support
+ * unwritten extents.
+ *
+ * Return: The requested offset on success, or -ENXIO if @whence specifies
+ * SEEK_DATA and there is no data after @start. There is an implicit hole
+ * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
+ * and @end contain data.
+ */
+loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
+		loff_t end, int whence)
+{
+	XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
+	pgoff_t max = (end - 1) >> PAGE_SHIFT;
+	bool seek_data = (whence == SEEK_DATA);
+	struct folio *folio;
+
+	if (end <= start)
+		return -ENXIO;
+
+	rcu_read_lock();
+	while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
+		loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
+		size_t seek_size;
+
+		if (start < pos) {	/* nothing cached between @start and this entry */
+			if (!seek_data)
+				goto unlock;	/* SEEK_HOLE: @start itself is the answer */
+			start = pos;
+		}
+
+		seek_size = seek_folio_size(&xas, folio);
+		pos = round_up((u64)pos + 1, seek_size);	/* first byte past this entry */
+		start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
+				seek_data);
+		if (start < pos)
+			goto unlock;	/* found inside this entry */
+		if (start >= end)
+			break;
+		if (seek_size > PAGE_SIZE)
+			xas_set(&xas, pos >> PAGE_SHIFT);	/* skip the rest of a multi-page entry */
+		if (!xa_is_value(folio))
+			folio_put(folio);
+	}
+	if (seek_data)
+		start = -ENXIO;
+unlock:
+	rcu_read_unlock();
+	if (folio && !xa_is_value(folio))
+		folio_put(folio);	/* drop the reference still held when the loop was left early */
+	if (start > end)
+		return end;
+	return start;
+}
+
+#ifdef CONFIG_MMU
+#define MMAP_LOTSAMISS (100)
+/*
+ * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
+ * @vmf - the vm_fault for this fault.
+ * @folio - the folio to lock.
+ * @fpin - the pointer to the file we may pin (or is already pinned).
+ *
+ * This works similar to lock_folio_or_retry in that it can drop the
+ * mmap_lock. It differs in that it actually returns the folio locked
+ * if it returns 1 and 0 if it couldn't lock the folio. If we did have
+ * to drop the mmap_lock then fpin will point to the pinned file and
+ * needs to be fput()'ed at a later point.
+ */
+static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
+				     struct file **fpin)
+{
+	if (folio_trylock(folio))
+		return 1;	/* got the lock without waiting */
+
+	/*
+	 * NOTE! This will make us return with VM_FAULT_RETRY, but with
+	 * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
+	 * is supposed to work. We have way too many special cases..
+	 */
+	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+		return 0;
+
+	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
+	if (vmf->flags & FAULT_FLAG_KILLABLE) {
+		if (__folio_lock_killable(folio)) {
+			/*
+			 * We didn't have the right flags to drop the mmap_lock,
+			 * but all fault_handlers only check for fatal signals
+			 * if we return VM_FAULT_RETRY, so we need to drop the
+			 * mmap_lock here and return 0 if we don't have a fpin.
+			 */
+			if (*fpin == NULL)
+				mmap_read_unlock(vmf->vma->vm_mm);
+			return 0;
+		}
+	} else
+		__folio_lock(folio);
+
+	return 1;
+}
+
+/*
+ * Synchronous readahead happens when we don't even find a page in the page
+ * cache at all. We don't want to perform IO under the mmap sem, so if we have
+ * to drop the mmap sem we return the file that was pinned in order for us to do
+ * that. If we didn't pin a file then we return NULL. The file that is
+ * returned needs to be fput()'ed when we're done with it.
+ */
+static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
+{
+	struct file *file = vmf->vma->vm_file;
+	struct file_ra_state *ra = &file->f_ra;
+	struct address_space *mapping = file->f_mapping;
+	DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
+	struct file *fpin = NULL;
+	unsigned long vm_flags = vmf->vma->vm_flags;
+	unsigned int mmap_miss;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	/* Use the readahead code, even if readahead is disabled */
+	if (vm_flags & VM_HUGEPAGE) {
+		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+		ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);	/* align the start down to a PMD-sized boundary */
+		ra->size = HPAGE_PMD_NR;
+		/*
+		 * Fetch two PMD folios, so we get the chance to actually
+		 * readahead, unless we've been told not to.
+		 */
+		if (!(vm_flags & VM_RAND_READ))
+			ra->size *= 2;
+		ra->async_size = HPAGE_PMD_NR;
+		page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
+		return fpin;
+	}
+#endif
+
+	/* If we don't want any read-ahead, don't bother */
+	if (vm_flags & VM_RAND_READ)
+		return fpin;
+	if (!ra->ra_pages)
+		return fpin;
+
+	if (vm_flags & VM_SEQ_READ) {
+		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+		page_cache_sync_ra(&ractl, ra->ra_pages);
+		return fpin;
+	}
+
+	/* Avoid banging the cache line if not needed */
+	mmap_miss = READ_ONCE(ra->mmap_miss);
+	if (mmap_miss < MMAP_LOTSAMISS * 10)
+		WRITE_ONCE(ra->mmap_miss, ++mmap_miss);	/* the counter stops growing at ten times the threshold */
+
+	/*
+	 * Do we miss much more than hit in this file? If so,
+	 * stop bothering with read-ahead. It will only hurt.
+	 */
+	if (mmap_miss > MMAP_LOTSAMISS)
+		return fpin;
+
+	/*
+	 * mmap read-around
+	 */
+	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);	/* window centred on the fault, clamped at index 0 */
+	ra->size = ra->ra_pages;
+	ra->async_size = ra->ra_pages / 4;
+	ractl._index = ra->start;
+	page_cache_ra_order(&ractl, ra, 0);
+	return fpin;
+}
+
+/*
+ * Asynchronous readahead happens when we find the page and PG_readahead,
+ * so we want to possibly extend the readahead further. We return the file that
+ * was pinned if we have to drop the mmap_lock in order to do IO.
+ */
+static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
+					    struct folio *folio)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct file *file = vma->vm_file;
+	struct file_ra_state *ra = &file->f_ra;
+	DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
+	struct file *pinned;
+	unsigned int misses;
+
+	/* Random-access mapping or readahead switched off: nothing to do */
+	if ((vma->vm_flags & VM_RAND_READ) || !ra->ra_pages)
+		return NULL;
+
+	/* The folio was found in the cache, so take one miss back off */
+	misses = READ_ONCE(ra->mmap_miss);
+	if (misses)
+		WRITE_ONCE(ra->mmap_miss, misses - 1);
+
+	/* Only a folio carrying the readahead flag triggers more readahead */
+	if (!folio_test_readahead(folio))
+		return NULL;
+
+	pinned = maybe_unlock_mmap_for_io(vmf, NULL);
+	page_cache_async_ra(&ractl, folio, ra->ra_pages);
+	return pinned;
+}
+
+/**
+ * filemap_fault - read in file data for page fault handling
+ * @vmf: struct vm_fault containing details of the fault
+ *
+ * filemap_fault() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * The goto's are kind of ugly, but this streamlines the normal case of having
+ * it in the page cache, and handles the special cases reasonably without
+ * having a lot of duplicated code.
+ *
+ * vma->vm_mm->mmap_lock must be held on entry.
+ *
+ * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
+ * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
+ *
+ * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
+ * has not been released.
+ *
+ * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
+ *
+ * Return: bitwise-OR of %VM_FAULT_ codes.
+ */ +vm_fault_t filemap_fault(struct vm_fault *vmf) +{ + int error; + struct file *file = vmf->vma->vm_file; + struct file *fpin = NULL; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + pgoff_t max_idx, index = vmf->pgoff; + struct folio *folio; + vm_fault_t ret = 0; + bool mapping_locked = false; + + max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(index >= max_idx)) + return VM_FAULT_SIGBUS; + + /* + * Do we have something in the page cache already? + */ + folio = filemap_get_folio(mapping, index); + if (likely(folio)) { + /* + * We found the page, so try async readahead before waiting for + * the lock. + */ + if (!(vmf->flags & FAULT_FLAG_TRIED)) + fpin = do_async_mmap_readahead(vmf, folio); + if (unlikely(!folio_test_uptodate(folio))) { + filemap_invalidate_lock_shared(mapping); + mapping_locked = true; + } + } else { + /* No page in the page cache at all */ + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); + ret = VM_FAULT_MAJOR; + fpin = do_sync_mmap_readahead(vmf); +retry_find: + /* + * See comment in filemap_create_folio() why we need + * invalidate_lock + */ + if (!mapping_locked) { + filemap_invalidate_lock_shared(mapping); + mapping_locked = true; + } + folio = __filemap_get_folio(mapping, index, + FGP_CREAT|FGP_FOR_MMAP, + vmf->gfp_mask); + if (!folio) { + if (fpin) + goto out_retry; + filemap_invalidate_unlock_shared(mapping); + return VM_FAULT_OOM; + } + } + + if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin)) + goto out_retry; + + /* Did it get truncated? */ + if (unlikely(folio->mapping != mapping)) { + folio_unlock(folio); + folio_put(folio); + goto retry_find; + } + VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + + /* + * We have a locked page in the page cache, now we need to check + * that it's up-to-date. If not, it is going to be due to an error. 
+ */ + if (unlikely(!folio_test_uptodate(folio))) { + /* + * The page was in cache and uptodate and now it is not. + * Strange but possible since we didn't hold the page lock all + * the time. Let's drop everything get the invalidate lock and + * try again. + */ + if (!mapping_locked) { + folio_unlock(folio); + folio_put(folio); + goto retry_find; + } + goto page_not_uptodate; + } + + /* + * We've made it this far and we had to drop our mmap_lock, now is the + * time to return to the upper layer and have it re-find the vma and + * redo the fault. + */ + if (fpin) { + folio_unlock(folio); + goto out_retry; + } + if (mapping_locked) + filemap_invalidate_unlock_shared(mapping); + + /* + * Found the page and have a reference on it. + * We must recheck i_size under page lock. + */ + max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(index >= max_idx)) { + folio_unlock(folio); + folio_put(folio); + return VM_FAULT_SIGBUS; + } + + vmf->page = folio_file_page(folio, index); + return ret | VM_FAULT_LOCKED; + +page_not_uptodate: + /* + * Umm, take care of errors if the page isn't up-to-date. + * Try to re-read it _once_. We do this synchronously, + * because there really aren't any performance issues here + * and we need to check for errors. + */ + fpin = maybe_unlock_mmap_for_io(vmf, fpin); + error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); + if (fpin) + goto out_retry; + folio_put(folio); + + if (!error || error == AOP_TRUNCATED_PAGE) + goto retry_find; + filemap_invalidate_unlock_shared(mapping); + + return VM_FAULT_SIGBUS; + +out_retry: + /* + * We dropped the mmap_lock, we need to return to the fault handler to + * re-find the vma and come back and find our hopefully still populated + * page. 
+ */ + if (folio) + folio_put(folio); + if (mapping_locked) + filemap_invalidate_unlock_shared(mapping); + if (fpin) + fput(fpin); + return ret | VM_FAULT_RETRY; +} +EXPORT_SYMBOL(filemap_fault); + +static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) +{ + struct mm_struct *mm = vmf->vma->vm_mm; + + /* Huge page is mapped? No need to proceed. */ + if (pmd_trans_huge(*vmf->pmd)) { + unlock_page(page); + put_page(page); + return true; + } + + if (pmd_none(*vmf->pmd) && PageTransHuge(page)) { + vm_fault_t ret = do_set_pmd(vmf, page); + if (!ret) { + /* The page is mapped successfully, reference consumed. */ + unlock_page(page); + return true; + } + } + + if (pmd_none(*vmf->pmd) && vmf->prealloc_pte) + pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); + + /* See comment in handle_pte_fault() */ + if (pmd_devmap_trans_unstable(vmf->pmd)) { + unlock_page(page); + put_page(page); + return true; + } + + return false; +} + +static struct folio *next_uptodate_page(struct folio *folio, + struct address_space *mapping, + struct xa_state *xas, pgoff_t end_pgoff) +{ + unsigned long max_idx; + + do { + if (!folio) + return NULL; + if (xas_retry(xas, folio)) + continue; + if (xa_is_value(folio)) + continue; + if (folio_test_locked(folio)) + continue; + if (!folio_try_get_rcu(folio)) + continue; + /* Has the page moved or been split? 
*/ + if (unlikely(folio != xas_reload(xas))) + goto skip; + if (!folio_test_uptodate(folio) || folio_test_readahead(folio)) + goto skip; + if (!folio_trylock(folio)) + goto skip; + if (folio->mapping != mapping) + goto unlock; + if (!folio_test_uptodate(folio)) + goto unlock; + max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (xas->xa_index >= max_idx) + goto unlock; + return folio; +unlock: + folio_unlock(folio); +skip: + folio_put(folio); + } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL); + + return NULL; +} + +static inline struct folio *first_map_page(struct address_space *mapping, + struct xa_state *xas, + pgoff_t end_pgoff) +{ + return next_uptodate_page(xas_find(xas, end_pgoff), + mapping, xas, end_pgoff); +} + +static inline struct folio *next_map_page(struct address_space *mapping, + struct xa_state *xas, + pgoff_t end_pgoff) +{ + return next_uptodate_page(xas_next_entry(xas, end_pgoff), + mapping, xas, end_pgoff); +} + +vm_fault_t filemap_map_pages(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff) +{ + struct vm_area_struct *vma = vmf->vma; + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + pgoff_t last_pgoff = start_pgoff; + unsigned long addr; + XA_STATE(xas, &mapping->i_pages, start_pgoff); + struct folio *folio; + struct page *page; + unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); + vm_fault_t ret = 0; + + rcu_read_lock(); + folio = first_map_page(mapping, &xas, end_pgoff); + if (!folio) + goto out; + + if (filemap_map_pmd(vmf, &folio->page)) { + ret = VM_FAULT_NOPAGE; + goto out; + } + + addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); + do { +again: + page = folio_file_page(folio, xas.xa_index); + if (PageHWPoison(page)) + goto unlock; + + if (mmap_miss > 0) + mmap_miss--; + + addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; + vmf->pte += xas.xa_index - 
last_pgoff; + last_pgoff = xas.xa_index; + + /* + * NOTE: If there're PTE markers, we'll leave them to be + * handled in the specific fault path, and it'll prohibit the + * fault-around logic. + */ + if (!pte_none(*vmf->pte)) + goto unlock; + + /* We're about to handle the fault */ + if (vmf->address == addr) + ret = VM_FAULT_NOPAGE; + + do_set_pte(vmf, page, addr); + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, addr, vmf->pte); + if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { + xas.xa_index++; + folio_ref_inc(folio); + goto again; + } + folio_unlock(folio); + continue; +unlock: + if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { + xas.xa_index++; + goto again; + } + folio_unlock(folio); + folio_put(folio); + } while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL); + pte_unmap_unlock(vmf->pte, vmf->ptl); +out: + rcu_read_unlock(); + WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); + return ret; +} +EXPORT_SYMBOL(filemap_map_pages); + +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + struct folio *folio = page_folio(vmf->page); + vm_fault_t ret = VM_FAULT_LOCKED; + + sb_start_pagefault(mapping->host->i_sb); + file_update_time(vmf->vma->vm_file); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + ret = VM_FAULT_NOPAGE; + goto out; + } + /* + * We mark the folio dirty already here so that when freeze is in + * progress, we are guaranteed that writeback during freezing will + * see the dirty folio and writeprotect it again. 
+ */ + folio_mark_dirty(folio); + folio_wait_stable(folio); +out: + sb_end_pagefault(mapping->host->i_sb); + return ret; +} + +const struct vm_operations_struct generic_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = filemap_page_mkwrite, +}; + +/* This is used for a general mmap of a disk file */ + +int generic_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + + if (!mapping->a_ops->read_folio) + return -ENOEXEC; + file_accessed(file); + vma->vm_ops = &generic_file_vm_ops; + return 0; +} + +/* + * This is for filesystems which do not implement ->writepage. + */ +int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + return generic_file_mmap(file, vma); +} +#else +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} +int generic_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + return -ENOSYS; +} +int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) +{ + return -ENOSYS; +} +#endif /* CONFIG_MMU */ + +EXPORT_SYMBOL(filemap_page_mkwrite); +EXPORT_SYMBOL(generic_file_mmap); +EXPORT_SYMBOL(generic_file_readonly_mmap); + +static struct folio *do_read_cache_folio(struct address_space *mapping, + pgoff_t index, filler_t filler, struct file *file, gfp_t gfp) +{ + struct folio *folio; + int err; + + if (!filler) + filler = mapping->a_ops->read_folio; +repeat: + folio = filemap_get_folio(mapping, index); + if (!folio) { + folio = filemap_alloc_folio(gfp, 0); + if (!folio) + return ERR_PTR(-ENOMEM); + err = filemap_add_folio(mapping, folio, index, gfp); + if (unlikely(err)) { + folio_put(folio); + if (err == -EEXIST) + goto repeat; + /* Presumably ENOMEM for xarray node */ + return ERR_PTR(err); + } + + goto filler; + } + if (folio_test_uptodate(folio)) + goto out; + + if 
(!folio_trylock(folio)) { + folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); + goto repeat; + } + + /* Folio was truncated from mapping */ + if (!folio->mapping) { + folio_unlock(folio); + folio_put(folio); + goto repeat; + } + + /* Someone else locked and filled the page in a very small window */ + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + goto out; + } + +filler: + err = filemap_read_folio(file, filler, folio); + if (err) { + folio_put(folio); + if (err == AOP_TRUNCATED_PAGE) + goto repeat; + return ERR_PTR(err); + } + +out: + folio_mark_accessed(folio); + return folio; +} + +/** + * read_cache_folio - Read into page cache, fill it if needed. + * @mapping: The address_space to read from. + * @index: The index to read. + * @filler: Function to perform the read, or NULL to use aops->read_folio(). + * @file: Passed to filler function, may be NULL if not required. + * + * Read one page into the page cache. If it succeeds, the folio returned + * will contain @index, but it may not be the first page of the folio. + * + * If the filler function returns an error, it will be returned to the + * caller. + * + * Context: May sleep. Expects mapping->invalidate_lock to be held. + * Return: An uptodate folio on success, ERR_PTR() on failure. 
+ */ +struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, + filler_t filler, struct file *file) +{ + return do_read_cache_folio(mapping, index, filler, file, + mapping_gfp_mask(mapping)); +} +EXPORT_SYMBOL(read_cache_folio); + +static struct page *do_read_cache_page(struct address_space *mapping, + pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp) +{ + struct folio *folio; + + folio = do_read_cache_folio(mapping, index, filler, file, gfp); + if (IS_ERR(folio)) + return &folio->page; + return folio_file_page(folio, index); +} + +struct page *read_cache_page(struct address_space *mapping, + pgoff_t index, filler_t *filler, struct file *file) +{ + return do_read_cache_page(mapping, index, filler, file, + mapping_gfp_mask(mapping)); +} +EXPORT_SYMBOL(read_cache_page); + +/** + * read_cache_page_gfp - read into page cache, using specified page allocation flags. + * @mapping: the page's address_space + * @index: the page index + * @gfp: the page allocator flags to use if allocating + * + * This is the same as "read_mapping_page(mapping, index, NULL)", but with + * any new page allocations done using the specified allocation flags. + * + * If the page does not get brought uptodate, return -EIO. + * + * The function expects mapping->invalidate_lock to be already held. + * + * Return: up to date page on success, ERR_PTR() on failure. + */ +struct page *read_cache_page_gfp(struct address_space *mapping, + pgoff_t index, + gfp_t gfp) +{ + return do_read_cache_page(mapping, index, NULL, NULL, gfp); +} +EXPORT_SYMBOL(read_cache_page_gfp); + +/* + * Warn about a page cache invalidation failure during a direct I/O write. 
+ */ +void dio_warn_stale_pagecache(struct file *filp) +{ + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); + char pathname[128]; + char *path; + + errseq_set(&filp->f_mapping->wb_err, -EIO); + if (__ratelimit(&_rs)) { + path = file_path(filp, pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, + current->comm); + } +} + +ssize_t +generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos; + ssize_t written; + size_t write_len; + pgoff_t end; + + write_len = iov_iter_count(from); + end = (pos + write_len - 1) >> PAGE_SHIFT; + + if (iocb->ki_flags & IOCB_NOWAIT) { + /* If there are pages to writeback, return */ + if (filemap_range_has_page(file->f_mapping, pos, + pos + write_len - 1)) + return -EAGAIN; + } else { + written = filemap_write_and_wait_range(mapping, pos, + pos + write_len - 1); + if (written) + goto out; + } + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + written = invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. 
+ */ + if (written) { + if (written == -EBUSY) + return 0; + goto out; + } + + written = mapping->a_ops->direct_IO(iocb, from); + + /* + * Finally, try again to invalidate clean pages which might have been + * cached by non-direct readahead, or faulted in by get_user_pages() + * if the source of the write was an mmap'ed region of the file + * we're writing. Either one is a pretty crazy thing to do, + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... + * + * Most of the time we do not need this since dio_complete() will do + * the invalidation for us. However there are some file systems that + * do not end up with dio_complete() being called, so let's not break + * them by removing it completely. + * + * Noticeable example is a blkdev_direct_IO(). + * + * Skip invalidation for async writes or if mapping has no pages. + */ + if (written > 0 && mapping->nrpages && + invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end)) + dio_warn_stale_pagecache(file); + + if (written > 0) { + pos += written; + write_len -= written; + if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + iocb->ki_pos = pos; + } + if (written != -EIOCBQUEUED) + iov_iter_revert(from, write_len - iov_iter_count(from)); +out: + return written; +} +EXPORT_SYMBOL(generic_file_direct_write); + +ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) +{ + struct file *file = iocb->ki_filp; + loff_t pos = iocb->ki_pos; + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + long status = 0; + ssize_t written = 0; + + do { + struct page *page; + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + void *fsdata = NULL; + + offset = (pos & (PAGE_SIZE - 1)); + bytes = min_t(unsigned long, PAGE_SIZE - offset, + 
iov_iter_count(i)); + +again: + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + */ + if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { + status = -EFAULT; + break; + } + + if (fatal_signal_pending(current)) { + status = -EINTR; + break; + } + + status = a_ops->write_begin(file, mapping, pos, bytes, + &page, &fsdata); + if (unlikely(status < 0)) + break; + + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + + copied = copy_page_from_iter_atomic(page, offset, bytes, i); + flush_dcache_page(page); + + status = a_ops->write_end(file, mapping, pos, bytes, copied, + page, fsdata); + if (unlikely(status != copied)) { + iov_iter_revert(i, copied - max(status, 0L)); + if (unlikely(status < 0)) + break; + } + cond_resched(); + + if (unlikely(status == 0)) { + /* + * A short copy made ->write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. + */ + if (copied) + bytes = copied; + goto again; + } + pos += status; + written += status; + + balance_dirty_pages_ratelimited(mapping); + } while (iov_iter_count(i)); + + return written ? written : status; +} +EXPORT_SYMBOL(generic_perform_write); + +/** + * __generic_file_write_iter - write data to a file + * @iocb: IO state structure (file, offset, etc.) + * @from: iov_iter with data to write + * + * This function does all the work needed for actually writing data to a + * file. It does all basic checks, removes SUID from the file, updates + * modification times and calls proper subroutines depending on whether we + * do direct IO or a standard buffered write. + * + * It expects i_rwsem to be grabbed unless we work on a block device or similar + * object which does not need locking at all. 
+ * + * This function does *not* take care of syncing data in case of O_SYNC write. + * A caller has to handle it. This is mainly due to the fact that we want to + * avoid syncing under i_rwsem. + * + * Return: + * * number of bytes written, even for truncated writes + * * negative error code if no data has been written at all + */ +ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t written = 0; + ssize_t err; + ssize_t status; + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = inode_to_bdi(inode); + err = file_remove_privs(file); + if (err) + goto out; + + err = file_update_time(file); + if (err) + goto out; + + if (iocb->ki_flags & IOCB_DIRECT) { + loff_t pos, endbyte; + + written = generic_file_direct_write(iocb, from); + /* + * If the write stopped short of completing, fall back to + * buffered writes. Some filesystems do this for writes to + * holes, for example. For DAX files, a buffered write will + * not succeed (even if it did, DAX does not handle dirty + * page-cache pages correctly). + */ + if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) + goto out; + + pos = iocb->ki_pos; + status = generic_perform_write(iocb, from); + /* + * If generic_perform_write() returned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (unlikely(status < 0)) { + err = status; + goto out; + } + /* + * We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. 
+ */ + endbyte = pos + status - 1; + err = filemap_write_and_wait_range(mapping, pos, endbyte); + if (err == 0) { + iocb->ki_pos = endbyte + 1; + written += status; + invalidate_mapping_pages(mapping, + pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } + } else { + written = generic_perform_write(iocb, from); + if (likely(written > 0)) + iocb->ki_pos += written; + } +out: + current->backing_dev_info = NULL; + return written ? written : err; +} +EXPORT_SYMBOL(__generic_file_write_iter); + +/** + * generic_file_write_iter - write data to a file + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * This is a wrapper around __generic_file_write_iter() to be used by most + * filesystems. It runs generic_write_checks() and the write under i_rwsem + * (inode_lock()), then syncs written data via generic_write_sync() for O_SYNC. + * Return: + * * negative error code if no data has been written at all or + * vfs_fsync_range() failed for a synchronous write + * * number of bytes written, even for truncated writes + */ +ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret > 0) + ret = __generic_file_write_iter(iocb, from); + inode_unlock(inode); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} +EXPORT_SYMBOL(generic_file_write_iter); + +/** + * filemap_release_folio() - Release fs-specific metadata on a folio. + * @folio: The folio which the kernel is trying to free. + * @gfp: Memory allocation flags (and I/O mode). + * + * The address_space is trying to release any data attached to a folio + * (presumably at folio->private).
+ * + * This will also be called if the private_2 flag is set on a page, + * indicating that the folio has other metadata associated with it. + * + * The @gfp argument specifies whether I/O may be performed to release + * this page (__GFP_IO), and whether the call may block + * (__GFP_RECLAIM & __GFP_FS). + * + * Return: %true if the release was successful, otherwise %false. + */ +bool filemap_release_folio(struct folio *folio, gfp_t gfp) +{ + struct address_space * const mapping = folio->mapping; + + BUG_ON(!folio_test_locked(folio)); + if (!folio_needs_release(folio)) + return true; + if (folio_test_writeback(folio)) + return false; + + if (mapping && mapping->a_ops->release_folio) + return mapping->a_ops->release_folio(folio, gfp); + return try_to_free_buffers(folio); +} +EXPORT_SYMBOL(filemap_release_folio); diff --git a/mm/folio-compat.c b/mm/folio-compat.c new file mode 100644 index 000000000..e1e23b494 --- /dev/null +++ b/mm/folio-compat.c @@ -0,0 +1,148 @@ +/* + * Compatibility functions which bloat the callers too much to make inline. + * All of the callers of these functions should be converted to use folios + * eventually. 
+ */ + +#include +#include +#include +#include "internal.h" + +struct address_space *page_mapping(struct page *page) +{ + return folio_mapping(page_folio(page)); +} +EXPORT_SYMBOL(page_mapping); + +void unlock_page(struct page *page) +{ + return folio_unlock(page_folio(page)); +} +EXPORT_SYMBOL(unlock_page); + +void end_page_writeback(struct page *page) +{ + return folio_end_writeback(page_folio(page)); +} +EXPORT_SYMBOL(end_page_writeback); + +void wait_on_page_writeback(struct page *page) +{ + return folio_wait_writeback(page_folio(page)); +} +EXPORT_SYMBOL_GPL(wait_on_page_writeback); + +void wait_for_stable_page(struct page *page) +{ + return folio_wait_stable(page_folio(page)); +} +EXPORT_SYMBOL_GPL(wait_for_stable_page); + +bool page_mapped(struct page *page) +{ + return folio_mapped(page_folio(page)); +} +EXPORT_SYMBOL(page_mapped); + +void mark_page_accessed(struct page *page) +{ + folio_mark_accessed(page_folio(page)); +} +EXPORT_SYMBOL(mark_page_accessed); + +bool set_page_writeback(struct page *page) +{ + return folio_start_writeback(page_folio(page)); +} +EXPORT_SYMBOL(set_page_writeback); + +bool set_page_dirty(struct page *page) +{ + return folio_mark_dirty(page_folio(page)); +} +EXPORT_SYMBOL(set_page_dirty); + +int __set_page_dirty_nobuffers(struct page *page) +{ + return filemap_dirty_folio(page_mapping(page), page_folio(page)); +} +EXPORT_SYMBOL(__set_page_dirty_nobuffers); + +bool clear_page_dirty_for_io(struct page *page) +{ + return folio_clear_dirty_for_io(page_folio(page)); +} +EXPORT_SYMBOL(clear_page_dirty_for_io); + +bool redirty_page_for_writepage(struct writeback_control *wbc, + struct page *page) +{ + return folio_redirty_for_writepage(wbc, page_folio(page)); +} +EXPORT_SYMBOL(redirty_page_for_writepage); + +void lru_cache_add(struct page *page) +{ + folio_add_lru(page_folio(page)); +} +EXPORT_SYMBOL(lru_cache_add); + +void lru_cache_add_inactive_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + 
folio_add_lru_vma(page_folio(page), vma); +} + +int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t index, gfp_t gfp) +{ + return filemap_add_folio(mapping, page_folio(page), index, gfp); +} +EXPORT_SYMBOL(add_to_page_cache_lru); + +noinline +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, + int fgp_flags, gfp_t gfp) +{ + struct folio *folio; + + folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); + if ((fgp_flags & FGP_HEAD) || !folio || xa_is_value(folio)) + return &folio->page; + return folio_file_page(folio, index); +} +EXPORT_SYMBOL(pagecache_get_page); + +struct page *grab_cache_page_write_begin(struct address_space *mapping, + pgoff_t index) +{ + unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; + + return pagecache_get_page(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); +} +EXPORT_SYMBOL(grab_cache_page_write_begin); + +void delete_from_page_cache(struct page *page) +{ + return filemap_remove_folio(page_folio(page)); +} + +int try_to_release_page(struct page *page, gfp_t gfp) +{ + return filemap_release_folio(page_folio(page), gfp); +} +EXPORT_SYMBOL(try_to_release_page); + +int isolate_lru_page(struct page *page) +{ + if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) + return -EBUSY; + return folio_isolate_lru((struct folio *)page); +} + +void putback_lru_page(struct page *page) +{ + folio_putback_lru(page_folio(page)); +} diff --git a/mm/frontswap.c b/mm/frontswap.c new file mode 100644 index 000000000..279e55b4e --- /dev/null +++ b/mm/frontswap.c @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Frontswap frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of frontswap. See + * Documentation/mm/frontswap.rst for more information. + * + * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. 
+ * Author: Dan Magenheimer + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key); + +/* + * frontswap_ops is set by frontswap_register_ops, and provides the + * frontswap "backend" implementation functions. Only one implementation + * may be registered (a second registration fails with -EINVAL), and it + * can never deregister. This is a single pointer, not a list. + */ +static const struct frontswap_ops *frontswap_ops __read_mostly; + +#ifdef CONFIG_DEBUG_FS +/* + * Counters available via /sys/kernel/debug/frontswap (if debugfs is + * properly configured). These are for information only so are not protected + * against increment races. + */ +static u64 frontswap_loads; +static u64 frontswap_succ_stores; +static u64 frontswap_failed_stores; +static u64 frontswap_invalidates; + +static inline void inc_frontswap_loads(void) +{ + data_race(frontswap_loads++); +} +static inline void inc_frontswap_succ_stores(void) +{ + data_race(frontswap_succ_stores++); +} +static inline void inc_frontswap_failed_stores(void) +{ + data_race(frontswap_failed_stores++); +} +static inline void inc_frontswap_invalidates(void) +{ + data_race(frontswap_invalidates++); +} +#else +static inline void inc_frontswap_loads(void) { } +static inline void inc_frontswap_succ_stores(void) { } +static inline void inc_frontswap_failed_stores(void) { } +static inline void inc_frontswap_invalidates(void) { } +#endif + +/* + * Due to the asynchronous nature of the backends loading potentially + * _after_ the swap system has been activated, we have chokepoints + * on all frontswap functions to not call the backend until the backend + * has registered. + * + * This would not guard us against the user deciding to call swapoff right as + * we are calling the backend to initialize (so swapon is in action). + * Fortunately for us, the swapon_mutex has been taken by the callee so we are + * OK.
The other scenario, where calls to frontswap_store (called via + * swap_writepage) race with frontswap_invalidate_area (called via + * swapoff), is again guarded by the swap subsystem. + * + * While no backend is registered all calls to frontswap_[store|load| + * invalidate_area|invalidate_page] are ignored or fail. + * + * The time between the backend being registered and the swap file system + * calling the backend (via the frontswap_* functions) is indeterminate as + * frontswap_ops is not atomic_t (or a value guarded by a spinlock). + * That is OK as we are comfortable missing some of these calls to the newly + * registered backend. + * + * Obviously the opposite (unloading the backend) must be done after all + * the frontswap_[store|load|invalidate_area|invalidate_page] start + * ignoring or failing the requests. However, there is currently no way + * to unload a backend once it is registered. + */ + +/* + * Register operations for frontswap; fails with -EINVAL if already registered + */ +int frontswap_register_ops(const struct frontswap_ops *ops) +{ + if (frontswap_ops) + return -EINVAL; + + frontswap_ops = ops; + static_branch_inc(&frontswap_enabled_key); + return 0; +} + +/* + * Called when a swap device is swapon'd. + */ +void frontswap_init(unsigned type, unsigned long *map) +{ + struct swap_info_struct *sis = swap_info[type]; + + VM_BUG_ON(sis == NULL); + + /* + * p->frontswap is a bitmap that we MUST have to figure out which page + * has gone in frontswap. Without it there is no point in continuing. + */ + if (WARN_ON(!map)) + return; + /* + * Regardless of whether the frontswap backend has been loaded + * before this function or it will be later, we _MUST_ have the + * p->frontswap set to something valid to work properly.
+ */ + frontswap_map_set(sis, map); + + if (!frontswap_enabled()) + return; + frontswap_ops->init(type); +} + +static bool __frontswap_test(struct swap_info_struct *sis, + pgoff_t offset) +{ + if (sis->frontswap_map) + return test_bit(offset, sis->frontswap_map); + return false; +} + +static inline void __frontswap_set(struct swap_info_struct *sis, + pgoff_t offset) +{ + set_bit(offset, sis->frontswap_map); + atomic_inc(&sis->frontswap_pages); +} + +static inline void __frontswap_clear(struct swap_info_struct *sis, + pgoff_t offset) +{ + clear_bit(offset, sis->frontswap_map); + atomic_dec(&sis->frontswap_pages); +} + +/* + * "Store" data from a page to frontswap and associate it with the page's + * swaptype and offset. Page must be locked and in the swap cache. + * If frontswap already contains a page with matching swaptype and + * offset, the frontswap implementation may either overwrite the data and + * return success or invalidate the page from frontswap and return failure. + */ +int __frontswap_store(struct page *page) +{ + int ret = -1; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + VM_BUG_ON(!frontswap_ops); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(sis == NULL); + + /* + * If a dup, we must remove the old page first; we can't leave the + * old page no matter if the store of the new page succeeds or fails, + * and we can't rely on the new page replacing the old page as we may + * not store to the same implementation that contains the old page. 
+ */ + if (__frontswap_test(sis, offset)) { + __frontswap_clear(sis, offset); + frontswap_ops->invalidate_page(type, offset); + } + + ret = frontswap_ops->store(type, offset, page); + if (ret == 0) { + __frontswap_set(sis, offset); + inc_frontswap_succ_stores(); + } else { + inc_frontswap_failed_stores(); + } + + return ret; +} + +/* + * "Get" the data stored in frontswap for this page's swaptype and offset and + * use it to fill the page. Page must be locked and in the swap cache. Returns + * -1 if the offset is not in the frontswap map, else the result of ->load(). + */ +int __frontswap_load(struct page *page) +{ + int ret = -1; + swp_entry_t entry = { .val = page_private(page), }; + int type = swp_type(entry); + struct swap_info_struct *sis = swap_info[type]; + pgoff_t offset = swp_offset(entry); + + VM_BUG_ON(!frontswap_ops); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(sis == NULL); + + if (!__frontswap_test(sis, offset)) + return -1; + + /* Load from the registered backend; only successful loads are counted. */ + ret = frontswap_ops->load(type, offset, page); + if (ret == 0) + inc_frontswap_loads(); + return ret; +} + +/* + * Invalidate any data from frontswap associated with the specified swaptype + * and offset so that a subsequent "get" will fail. + */ +void __frontswap_invalidate_page(unsigned type, pgoff_t offset) +{ + struct swap_info_struct *sis = swap_info[type]; + + VM_BUG_ON(!frontswap_ops); + VM_BUG_ON(sis == NULL); + + if (!__frontswap_test(sis, offset)) + return; + + frontswap_ops->invalidate_page(type, offset); + __frontswap_clear(sis, offset); + inc_frontswap_invalidates(); +} + +/* + * Invalidate all data from frontswap associated with all offsets for the + * specified swaptype.
+ */ +void __frontswap_invalidate_area(unsigned type) +{ + struct swap_info_struct *sis = swap_info[type]; + + VM_BUG_ON(!frontswap_ops); + VM_BUG_ON(sis == NULL); + + if (sis->frontswap_map == NULL) + return; + + frontswap_ops->invalidate_area(type); + atomic_set(&sis->frontswap_pages, 0); + bitmap_zero(sis->frontswap_map, sis->max); +} + +static int __init init_frontswap(void) +{ +#ifdef CONFIG_DEBUG_FS + struct dentry *root = debugfs_create_dir("frontswap", NULL); + if (root == NULL) + return -ENXIO; + debugfs_create_u64("loads", 0444, root, &frontswap_loads); + debugfs_create_u64("succ_stores", 0444, root, &frontswap_succ_stores); + debugfs_create_u64("failed_stores", 0444, root, + &frontswap_failed_stores); + debugfs_create_u64("invalidates", 0444, root, &frontswap_invalidates); +#endif + return 0; +} + +module_init(init_frontswap); diff --git a/mm/gup.c b/mm/gup.c new file mode 100644 index 000000000..f4911ddd3 --- /dev/null +++ b/mm/gup.c @@ -0,0 +1,3305 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "internal.h" + +struct follow_page_context { + struct dev_pagemap *pgmap; + unsigned int page_mask; +}; + +static inline void sanity_check_pinned_pages(struct page **pages, + unsigned long npages) +{ + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + + /* + * We only pin anonymous pages if they are exclusive. Once pinned, we + * can no longer turn them possibly shared and PageAnonExclusive() will + * stick around until the page is freed. + * + * We'd like to verify that our pinned anonymous pages are still mapped + * exclusively. The issue with anon THP is that we don't know how + * they are/were mapped when pinning them. 
However, for anon + * THP we can assume that either the given page (PTE-mapped THP) or + * the head page (PMD-mapped THP) should be PageAnonExclusive(). If + * neither is the case, there is certainly something wrong. + */ + for (; npages; npages--, pages++) { + struct page *page = *pages; + struct folio *folio = page_folio(page); + + if (!folio_test_anon(folio)) + continue; + if (!folio_test_large(folio) || folio_test_hugetlb(folio)) + VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page); + else + /* Either a PTE-mapped or a PMD-mapped THP. */ + VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) && + !PageAnonExclusive(page), page); + } +} + +/* + * Return the folio with ref appropriately incremented, + * or NULL if that failed. + */ +static inline struct folio *try_get_folio(struct page *page, int refs) +{ + struct folio *folio; + +retry: + folio = page_folio(page); + if (WARN_ON_ONCE(folio_ref_count(folio) < 0)) + return NULL; + if (unlikely(!folio_ref_try_add_rcu(folio, refs))) + return NULL; + + /* + * At this point we have a stable reference to the folio; but it + * could be that between calling page_folio() and the refcount + * increment, the folio was split, in which case we'd end up + * holding a reference on a folio that has nothing to do with the page + * we were given anymore. + * So now that the folio is stable, recheck that the page still + * belongs to this folio. + */ + if (unlikely(page_folio(page) != folio)) { + if (!put_devmap_managed_page_refs(&folio->page, refs)) + folio_put_refs(folio, refs); + goto retry; + } + + return folio; +} + +/** + * try_grab_folio() - Attempt to get or pin a folio. + * @page: pointer to page to be grabbed + * @refs: the value to (effectively) add to the folio's refcount + * @flags: gup flags: these are the FOLL_* flag values. + * + * "grab" names in this file mean, "look at flags to decide whether to use + * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount. 
+ * + * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the + * same time. (That's true throughout the get_user_pages*() and + * pin_user_pages*() APIs.) Cases: + * + * FOLL_GET: folio's refcount will be incremented by @refs. + * + * FOLL_PIN on large folios: folio's refcount will be incremented by + * @refs, and its compound_pincount will be incremented by @refs. + * + * FOLL_PIN on single-page folios: folio's refcount will be incremented by + * @refs * GUP_PIN_COUNTING_BIAS. + * + * Return: The folio containing @page (with refcount appropriately + * incremented) for success, or NULL upon failure. If neither FOLL_GET + * nor FOLL_PIN was set, that's considered failure, and furthermore, + * a likely bug in the caller, so a warning is also emitted. + */ +struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) +{ + if (flags & FOLL_GET) + return try_get_folio(page, refs); + else if (flags & FOLL_PIN) { + struct folio *folio; + + /* + * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a + * right zone, so fail and let the caller fall back to the slow + * path. + */ + if (unlikely((flags & FOLL_LONGTERM) && + !is_longterm_pinnable_page(page))) + return NULL; + + /* + * CAUTION: Don't use compound_head() on the page before this + * point, the result won't be stable. + */ + folio = try_get_folio(page, refs); + if (!folio) + return NULL; + + /* + * When pinning a large folio, use an exact count to track it. + * + * However, be sure to *also* increment the normal folio + * refcount field at least once, so that the folio really + * is pinned. That's why the refcount from the earlier + * try_get_folio() is left intact. + */ + if (folio_test_large(folio)) + atomic_add(refs, folio_pincount_ptr(folio)); + else + folio_ref_add(folio, + refs * (GUP_PIN_COUNTING_BIAS - 1)); + /* + * Adjust the pincount before re-checking the PTE for changes. 
+ * This is essentially a smp_mb() and is paired with a memory + * barrier in page_try_share_anon_rmap(). + */ + smp_mb__after_atomic(); + + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); + + return folio; + } + + WARN_ON_ONCE(1); + return NULL; +} + +static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) +{ + if (flags & FOLL_PIN) { + node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); + if (folio_test_large(folio)) + atomic_sub(refs, folio_pincount_ptr(folio)); + else + refs *= GUP_PIN_COUNTING_BIAS; + } + + if (!put_devmap_managed_page_refs(&folio->page, refs)) + folio_put_refs(folio, refs); +} + +/** + * try_grab_page() - elevate a page's refcount by a flag-dependent amount + * @page: pointer to page to be grabbed + * @flags: gup flags: these are the FOLL_* flag values. + * + * This might not do anything at all, depending on the flags argument. + * + * "grab" names in this file mean, "look at flags to decide whether to use + * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount. + * + * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same + * time. Cases: please see the try_grab_folio() documentation, with + * "refs=1". + * + * Return: true for success, or if no action was required (if neither FOLL_PIN + * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or + * FOLL_PIN was set, but the page could not be grabbed. + */ +bool __must_check try_grab_page(struct page *page, unsigned int flags) +{ + struct folio *folio = page_folio(page); + + WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); + if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) + return false; + + if (flags & FOLL_GET) + folio_ref_inc(folio); + else if (flags & FOLL_PIN) { + /* + * Similar to try_grab_folio(): be sure to *also* + * increment the normal page refcount field at least once, + * so that the page really is pinned. 
+ */ + if (folio_test_large(folio)) { + folio_ref_add(folio, 1); + atomic_add(1, folio_pincount_ptr(folio)); + } else { + folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); + } + + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); + } + + return true; +} + +/** + * unpin_user_page() - release a dma-pinned page + * @page: pointer to page to be released + * + * Pages that were pinned via pin_user_pages*() must be released via either + * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so + * that such pages can be separately tracked and uniquely handled. In + * particular, interactions with RDMA and filesystems need special handling. + */ +void unpin_user_page(struct page *page) +{ + sanity_check_pinned_pages(&page, 1); + gup_put_folio(page_folio(page), 1, FOLL_PIN); +} +EXPORT_SYMBOL(unpin_user_page); + +static inline struct folio *gup_folio_range_next(struct page *start, + unsigned long npages, unsigned long i, unsigned int *ntails) +{ + struct page *next = nth_page(start, i); + struct folio *folio = page_folio(next); + unsigned int nr = 1; + + if (folio_test_large(folio)) + nr = min_t(unsigned int, npages - i, + folio_nr_pages(folio) - folio_page_idx(folio, next)); + + *ntails = nr; + return folio; +} + +static inline struct folio *gup_folio_next(struct page **list, + unsigned long npages, unsigned long i, unsigned int *ntails) +{ + struct folio *folio = page_folio(list[i]); + unsigned int nr; + + for (nr = i + 1; nr < npages; nr++) { + if (page_folio(list[nr]) != folio) + break; + } + + *ntails = nr - i; + return folio; +} + +/** + * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages + * @pages: array of pages to be maybe marked dirty, and definitely released. + * @npages: number of pages in the @pages array. + * @make_dirty: whether to mark the pages dirty + * + * "gup-pinned page" refers to a page that has had one of the get_user_pages() + * variants called on that page. 
+ * + * For each page in the @pages array, make that page (or its head page, if a + * compound page) dirty, if @make_dirty is true, and if the page was previously + * listed as clean. In any case, releases all pages using unpin_user_page(), + * possibly via unpin_user_pages(), for the non-dirty case. + * + * Please see the unpin_user_page() documentation for details. + * + * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is + * required, then the caller should a) verify that this is really correct, + * because _lock() is usually required, and b) hand code it: + * set_page_dirty_lock(), unpin_user_page(). + * + */ +void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, + bool make_dirty) +{ + unsigned long i; + struct folio *folio; + unsigned int nr; + + if (!make_dirty) { + unpin_user_pages(pages, npages); + return; + } + + sanity_check_pinned_pages(pages, npages); + for (i = 0; i < npages; i += nr) { + folio = gup_folio_next(pages, npages, i, &nr); + /* + * Checking PageDirty at this point may race with + * clear_page_dirty_for_io(), but that's OK. Two key + * cases: + * + * 1) This code sees the page as already dirty, so it + * skips the call to set_page_dirty(). That could happen + * because clear_page_dirty_for_io() called + * page_mkclean(), followed by set_page_dirty(). + * However, now the page is going to get written back, + * which meets the original intention of setting it + * dirty, so all is well: clear_page_dirty_for_io() goes + * on to call TestClearPageDirty(), and write the page + * back. + * + * 2) This code sees the page as clean, so it calls + * set_page_dirty(). The page stays dirty, despite being + * written back, so it gets written back again in the + * next writeback cycle. This is harmless. 
+ */ + if (!folio_test_dirty(folio)) { + folio_lock(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + } + gup_put_folio(folio, nr, FOLL_PIN); + } +} +EXPORT_SYMBOL(unpin_user_pages_dirty_lock); + +/** + * unpin_user_page_range_dirty_lock() - release and optionally dirty + * gup-pinned page range + * + * @page: the starting page of a range maybe marked dirty, and definitely released. + * @npages: number of consecutive pages to release. + * @make_dirty: whether to mark the pages dirty + * + * "gup-pinned page range" refers to a range of pages that has had one of the + * pin_user_pages() variants called on that page. + * + * For the page ranges defined by [page .. page+npages], make that range (or + * its head pages, if a compound page) dirty, if @make_dirty is true, and if the + * page range was previously listed as clean. + * + * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is + * required, then the caller should a) verify that this is really correct, + * because _lock() is usually required, and b) hand code it: + * set_page_dirty_lock(), unpin_user_page(). + * + */ +void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, + bool make_dirty) +{ + unsigned long i; + struct folio *folio; + unsigned int nr; + + for (i = 0; i < npages; i += nr) { + folio = gup_folio_range_next(page, npages, i, &nr); + if (make_dirty && !folio_test_dirty(folio)) { + folio_lock(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + } + gup_put_folio(folio, nr, FOLL_PIN); + } +} +EXPORT_SYMBOL(unpin_user_page_range_dirty_lock); + +static void unpin_user_pages_lockless(struct page **pages, unsigned long npages) +{ + unsigned long i; + struct folio *folio; + unsigned int nr; + + /* + * Don't perform any sanity checks because we might have raced with + * fork() and some anonymous pages might now actually be shared -- + * which is why we're unpinning after all. 
+ */ + for (i = 0; i < npages; i += nr) { + folio = gup_folio_next(pages, npages, i, &nr); + gup_put_folio(folio, nr, FOLL_PIN); + } +} + +/** + * unpin_user_pages() - release an array of gup-pinned pages. + * @pages: array of pages to be released. + * @npages: number of pages in the @pages array. + * + * For each page in the @pages array, release the page using unpin_user_page(). + * + * Please see the unpin_user_page() documentation for details. + */ +void unpin_user_pages(struct page **pages, unsigned long npages) +{ + unsigned long i; + struct folio *folio; + unsigned int nr; + + /* + * If this WARN_ON() fires, then the system *might* be leaking pages (by + * leaving them pinned), but probably not. More likely, gup/pup returned + * a hard -ERRNO error to the caller, who erroneously passed it here. + */ + if (WARN_ON(IS_ERR_VALUE(npages))) + return; + + sanity_check_pinned_pages(pages, npages); + for (i = 0; i < npages; i += nr) { + folio = gup_folio_next(pages, npages, i, &nr); + gup_put_folio(folio, nr, FOLL_PIN); + } +} +EXPORT_SYMBOL(unpin_user_pages); + +/* + * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's + * lifecycle. Avoid setting the bit unless necessary, or it might cause write + * cache bouncing on large SMP machines for concurrent pinned gups. + */ +static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) +{ + if (!test_bit(MMF_HAS_PINNED, mm_flags)) + set_bit(MMF_HAS_PINNED, mm_flags); +} + +#ifdef CONFIG_MMU +static struct page *no_page_table(struct vm_area_struct *vma, + unsigned int flags) +{ + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate unnecessary pages or + * page tables. Return error instead of NULL to skip handle_mm_fault, + * then get_dump_page() will return NULL to leave a hole in the dump.
+ * But we can only make this optimization where a hole would surely + * be zero-filled if handle_mm_fault() actually did handle it. + */ + if ((flags & FOLL_DUMP) && + (vma_is_anonymous(vma) || !vma->vm_ops->fault)) + return ERR_PTR(-EFAULT); + return NULL; +} + +static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, + pte_t *pte, unsigned int flags) +{ + if (flags & FOLL_TOUCH) { + pte_t entry = *pte; + + if (flags & FOLL_WRITE) + entry = pte_mkdirty(entry); + entry = pte_mkyoung(entry); + + if (!pte_same(*pte, entry)) { + set_pte_at(vma->vm_mm, address, pte, entry); + update_mmu_cache(vma, address, pte); + } + } + + /* Proper page table entry exists, but no corresponding struct page */ + return -EEXIST; +} + +/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */ +static inline bool can_follow_write_pte(pte_t pte, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) +{ + /* If the pte is writable, we can write to the page. */ + if (pte_write(pte)) + return true; + + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + + /* ... and a write-fault isn't required for other reasons. 
*/ + if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) + return false; + return !userfaultfd_pte_wp(vma, pte); +} + +static struct page *follow_page_pte(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, unsigned int flags, + struct dev_pagemap **pgmap) +{ + struct mm_struct *mm = vma->vm_mm; + struct page *page; + spinlock_t *ptl; + pte_t *ptep, pte; + int ret; + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET))) + return ERR_PTR(-EINVAL); + + /* + * Considering PTE level hugetlb, like continuous-PTE hugetlb on + * ARM64 architecture. + */ + if (is_vm_hugetlb_page(vma)) { + page = follow_huge_pmd_pte(vma, address, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + +retry: + if (unlikely(pmd_bad(*pmd))) + return no_page_table(vma, flags); + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = *ptep; + if (!pte_present(pte)) { + swp_entry_t entry; + /* + * KSM's break_ksm() relies upon recognizing a ksm page + * even while it is being migrated, so for that case we + * need migration_entry_wait(). + */ + if (likely(!(flags & FOLL_MIGRATION))) + goto no_page; + if (pte_none(pte)) + goto no_page; + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto no_page; + pte_unmap_unlock(ptep, ptl); + migration_entry_wait(mm, pmd, address); + goto retry; + } + if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) + goto no_page; + + page = vm_normal_page(vma, address, pte); + + /* + * We only care about anon pages in can_follow_write_pte() and don't + * have to worry about pte_devmap() because they are never anon. 
+ */ + if ((flags & FOLL_WRITE) && + !can_follow_write_pte(pte, page, vma, flags)) { + page = NULL; + goto out; + } + + if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { + /* + * Only return device mapping pages in the FOLL_GET or FOLL_PIN + * case since they are only valid while holding the pgmap + * reference. + */ + *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); + if (*pgmap) + page = pte_page(pte); + else + goto no_page; + } else if (unlikely(!page)) { + if (flags & FOLL_DUMP) { + /* Avoid special (like zero) pages in core dumps */ + page = ERR_PTR(-EFAULT); + goto out; + } + + if (is_zero_pfn(pte_pfn(pte))) { + page = pte_page(pte); + } else { + ret = follow_pfn_pte(vma, address, ptep, flags); + page = ERR_PTR(ret); + goto out; + } + } + + if (!pte_write(pte) && gup_must_unshare(flags, page)) { + page = ERR_PTR(-EMLINK); + goto out; + } + + VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); + + /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ + if (unlikely(!try_grab_page(page, flags))) { + page = ERR_PTR(-ENOMEM); + goto out; + } + /* + * We need to make the page accessible if and only if we are going + * to access its content (the FOLL_PIN case). Please see + * Documentation/core-api/pin_user_pages.rst for details. + */ + if (flags & FOLL_PIN) { + ret = arch_make_page_accessible(page); + if (ret) { + unpin_user_page(page); + page = ERR_PTR(ret); + goto out; + } + } + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + /* + * pte_mkyoung() would be more correct here, but atomic care + * is needed to avoid losing the dirty bit: it is easier to use + * mark_page_accessed(). 
+ */ + mark_page_accessed(page); + } +out: + pte_unmap_unlock(ptep, ptl); + return page; +no_page: + pte_unmap_unlock(ptep, ptl); + if (!pte_none(pte)) + return NULL; + return no_page_table(vma, flags); +} + +static struct page *follow_pmd_mask(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + unsigned int flags, + struct follow_page_context *ctx) +{ + pmd_t *pmd, pmdval; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + pmd = pmd_offset(pudp, address); + /* + * The READ_ONCE() will stabilize the pmdval in a register or + * on the stack so that it will stop changing under the code. + */ + pmdval = READ_ONCE(*pmd); + if (pmd_none(pmdval)) + return no_page_table(vma, flags); + if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) { + page = follow_huge_pmd_pte(vma, address, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + if (is_hugepd(__hugepd(pmd_val(pmdval)))) { + page = follow_huge_pd(vma, address, + __hugepd(pmd_val(pmdval)), flags, + PMD_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } +retry: + if (!pmd_present(pmdval)) { + /* + * Should never reach here, if thp migration is not supported; + * Otherwise, it must be a thp migration entry. 
+ */ + VM_BUG_ON(!thp_migration_supported() || + !is_pmd_migration_entry(pmdval)); + + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + + pmd_migration_entry_wait(mm, pmd); + pmdval = READ_ONCE(*pmd); + /* + * MADV_DONTNEED may convert the pmd to null because + * mmap_lock is held in read mode + */ + if (pmd_none(pmdval)) + return no_page_table(vma, flags); + goto retry; + } + if (pmd_devmap(pmdval)) { + ptl = pmd_lock(mm, pmd); + page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); + spin_unlock(ptl); + if (page) + return page; + } + if (likely(!pmd_trans_huge(pmdval))) + return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + + if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags)) + return no_page_table(vma, flags); + +retry_locked: + ptl = pmd_lock(mm, pmd); + if (unlikely(pmd_none(*pmd))) { + spin_unlock(ptl); + return no_page_table(vma, flags); + } + if (unlikely(!pmd_present(*pmd))) { + spin_unlock(ptl); + if (likely(!(flags & FOLL_MIGRATION))) + return no_page_table(vma, flags); + pmd_migration_entry_wait(mm, pmd); + goto retry_locked; + } + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(ptl); + return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + } + if (flags & FOLL_SPLIT_PMD) { + int ret; + page = pmd_page(*pmd); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + ret = 0; + split_huge_pmd(vma, pmd, address); + if (pmd_trans_unstable(pmd)) + ret = -EBUSY; + } else { + spin_unlock(ptl); + split_huge_pmd(vma, pmd, address); + ret = pte_alloc(mm, pmd) ? -ENOMEM : 0; + } + + return ret ? 
ERR_PTR(ret) : + follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + } + page = follow_trans_huge_pmd(vma, address, pmd, flags); + spin_unlock(ptl); + ctx->page_mask = HPAGE_PMD_NR - 1; + return page; +} + +static struct page *follow_pud_mask(struct vm_area_struct *vma, + unsigned long address, p4d_t *p4dp, + unsigned int flags, + struct follow_page_context *ctx) +{ + pud_t *pud; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + pud = pud_offset(p4dp, address); + if (pud_none(*pud)) + return no_page_table(vma, flags); + if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) { + page = follow_huge_pud(mm, address, pud, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + if (is_hugepd(__hugepd(pud_val(*pud)))) { + page = follow_huge_pd(vma, address, + __hugepd(pud_val(*pud)), flags, + PUD_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } + if (pud_devmap(*pud)) { + ptl = pud_lock(mm, pud); + page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); + spin_unlock(ptl); + if (page) + return page; + } + if (unlikely(pud_bad(*pud))) + return no_page_table(vma, flags); + + return follow_pmd_mask(vma, address, pud, flags, ctx); +} + +static struct page *follow_p4d_mask(struct vm_area_struct *vma, + unsigned long address, pgd_t *pgdp, + unsigned int flags, + struct follow_page_context *ctx) +{ + p4d_t *p4d; + struct page *page; + + p4d = p4d_offset(pgdp, address); + if (p4d_none(*p4d)) + return no_page_table(vma, flags); + BUILD_BUG_ON(p4d_huge(*p4d)); + if (unlikely(p4d_bad(*p4d))) + return no_page_table(vma, flags); + + if (is_hugepd(__hugepd(p4d_val(*p4d)))) { + page = follow_huge_pd(vma, address, + __hugepd(p4d_val(*p4d)), flags, + P4D_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } + return follow_pud_mask(vma, address, p4d, flags, ctx); +} + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping 
@address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a + * pointer to output page_mask + * + * @flags can have FOLL_ flags set, defined in <linux/mm.h> + * + * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches + * the device's dev_pagemap metadata to avoid repeating expensive lookups. + * + * When getting an anonymous page and the caller has to trigger unsharing + * of a shared anonymous page first, -EMLINK is returned. The caller should + * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only + * relevant with FOLL_PIN and !FOLL_WRITE. + * + * On output, the @ctx->page_mask is set according to the size of the page. + * + * Return: the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). + */ +static struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + struct follow_page_context *ctx) +{ + pgd_t *pgd; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + ctx->page_mask = 0; + + /* make this handle hugepd */ + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN)); + return page; + } + + pgd = pgd_offset(mm, address); + + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return no_page_table(vma, flags); + + if (pgd_huge(*pgd)) { + page = follow_huge_pgd(mm, address, pgd, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + if (is_hugepd(__hugepd(pgd_val(*pgd)))) { + page = follow_huge_pd(vma, address, + __hugepd(pgd_val(*pgd)), flags, + PGDIR_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } + + return follow_p4d_mask(vma, address, pgd, flags, ctx); +} + +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, +
unsigned int foll_flags) +{ + struct follow_page_context ctx = { NULL }; + struct page *page; + + if (vma_is_secretmem(vma)) + return NULL; + + if (foll_flags & FOLL_PIN) + return NULL; + + page = follow_page_mask(vma, address, foll_flags, &ctx); + if (ctx.pgmap) + put_dev_pagemap(ctx.pgmap); + return page; +} + +static int get_gate_page(struct mm_struct *mm, unsigned long address, + unsigned int gup_flags, struct vm_area_struct **vma, + struct page **page) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int ret = -EFAULT; + + /* user gate pages are read-only */ + if (gup_flags & FOLL_WRITE) + return -EFAULT; + if (address > TASK_SIZE) + pgd = pgd_offset_k(address); + else + pgd = pgd_offset_gate(mm, address); + if (pgd_none(*pgd)) + return -EFAULT; + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d)) + return -EFAULT; + pud = pud_offset(p4d, address); + if (pud_none(*pud)) + return -EFAULT; + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return -EFAULT; + VM_BUG_ON(pmd_trans_huge(*pmd)); + pte = pte_offset_map(pmd, address); + if (pte_none(*pte)) + goto unmap; + *vma = get_gate_vma(mm); + if (!page) + goto out; + *page = vm_normal_page(*vma, address, *pte); + if (!*page) { + if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) + goto unmap; + *page = pte_page(*pte); + } + if (unlikely(!try_grab_page(*page, gup_flags))) { + ret = -ENOMEM; + goto unmap; + } +out: + ret = 0; +unmap: + pte_unmap(pte); + return ret; +} + +/* + * mmap_lock must be held on entry. If @locked != NULL and *@flags + * does not include FOLL_NOWAIT, the mmap_lock may be released. If it + * is, *@locked will be set to 0 and -EBUSY returned. 
+ */ +static int faultin_page(struct vm_area_struct *vma, + unsigned long address, unsigned int *flags, bool unshare, + int *locked) +{ + unsigned int fault_flags = 0; + vm_fault_t ret; + + if (*flags & FOLL_NOFAULT) + return -EFAULT; + if (*flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (*flags & FOLL_REMOTE) + fault_flags |= FAULT_FLAG_REMOTE; + if (locked) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + if (*flags & FOLL_NOWAIT) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; + if (*flags & FOLL_TRIED) { + /* + * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED + * can co-exist + */ + fault_flags |= FAULT_FLAG_TRIED; + } + if (unshare) { + fault_flags |= FAULT_FLAG_UNSHARE; + /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */ + VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE); + } + + ret = handle_mm_fault(vma, address, fault_flags, NULL); + + if (ret & VM_FAULT_COMPLETED) { + /* + * With FAULT_FLAG_RETRY_NOWAIT we'll never release the + * mmap lock in the page fault handler. Sanity check this. + */ + WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT); + if (locked) + *locked = 0; + /* + * We should do the same as VM_FAULT_RETRY, but let's not + * return -EBUSY since that's not reflecting the reality of + * what has happened - we've just fully completed a page + * fault, with the mmap lock released. Use -EAGAIN to show + * that we want to take the mmap lock _again_. 
+ */ + return -EAGAIN; + } + + if (ret & VM_FAULT_ERROR) { + int err = vm_fault_to_errno(ret, *flags); + + if (err) + return err; + BUG(); + } + + if (ret & VM_FAULT_RETRY) { + if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) + *locked = 0; + return -EBUSY; + } + + return 0; +} + +static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) +{ + vm_flags_t vm_flags = vma->vm_flags; + int write = (gup_flags & FOLL_WRITE); + int foreign = (gup_flags & FOLL_REMOTE); + + if (vm_flags & (VM_IO | VM_PFNMAP)) + return -EFAULT; + + if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma)) + return -EFAULT; + + if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) + return -EOPNOTSUPP; + + if (vma_is_secretmem(vma)) + return -EFAULT; + + if (write) { + if (!(vm_flags & VM_WRITE)) { + if (!(gup_flags & FOLL_FORCE)) + return -EFAULT; + /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ + if (is_vm_hugetlb_page(vma)) + return -EFAULT; + /* + * We used to let the write,force case do COW in a + * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could + * set a breakpoint in a read-only mapping of an + * executable, without corrupting the file (yet only + * when that file had been opened for writing!). + * Anon pages in shared mappings are surprising: now + * just reject it. + */ + if (!is_cow_mapping(vm_flags)) + return -EFAULT; + } + } else if (!(vm_flags & VM_READ)) { + if (!(gup_flags & FOLL_FORCE)) + return -EFAULT; + /* + * Is there actually any vma we can reach here which does not + * have VM_MAYREAD set? 
+ */ + if (!(vm_flags & VM_MAYREAD)) + return -EFAULT; + } + /* + * gups are always data accesses, not instruction + * fetches, so execute=false here + */ + if (!arch_vma_access_permitted(vma, write, false, foreign)) + return -EFAULT; + return 0; +} + +/** + * __get_user_pages() - pin user pages in memory + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @locked: whether we're still with the mmap_lock held + * + * Returns either number of pages pinned (which may be less than the + * number requested), or an error. Details about the return value: + * + * -- If nr_pages is 0, returns 0. + * -- If nr_pages is >0, but no pages were pinned, returns -errno. + * -- If nr_pages is >0, and some pages were pinned, returns the number of + * pages pinned. Again, this may be less than nr_pages. + * -- 0 return value is possible when the fault would need to be retried. + * + * The caller is responsible for releasing returned @pages, via put_page(). + * + * @vmas are valid only as long as mmap_lock is held. + * + * Must be called with mmap_lock held. It may be released. See below. + * + * __get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * __get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. 
if mmapped pagecache has been invalidated + * and subsequently re-faulted). However, it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If + * the page is written to, set_page_dirty (or set_page_dirty_lock, as + * appropriate) must be called after the page is finished with, and + * before put_page is called. + * + * If @locked != NULL, *@locked will be set to 0 when mmap_lock is + * released by an up_read(). That can happen if @gup_flags does not + * have FOLL_NOWAIT. + * + * A caller using such a combination of @locked and @gup_flags + * must therefore hold the mmap_lock for reading only, and recognize + * when it's been released. Otherwise, it must be held for either + * reading or writing and will not be released. + * + * In most cases, get_user_pages or get_user_pages_fast should be used + * instead of __get_user_pages. __get_user_pages should be used only if + * you need some special @gup_flags.
+ */ +static long __get_user_pages(struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked) +{ + long ret = 0, i = 0; + struct vm_area_struct *vma = NULL; + struct follow_page_context ctx = { NULL }; + + if (!nr_pages) + return 0; + + start = untagged_addr(start); + + VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); + + do { + struct page *page; + unsigned int foll_flags = gup_flags; + unsigned int page_increm; + + /* first iteration or cross vma bound */ + if (!vma || start >= vma->vm_end) { + vma = vma_lookup(mm, start); + if (!vma && in_gate_area(mm, start)) { + ret = get_gate_page(mm, start & PAGE_MASK, + gup_flags, &vma, + pages ? &pages[i] : NULL); + if (ret) + goto out; + ctx.page_mask = 0; + goto next_page; + } + + if (!vma) { + ret = -EFAULT; + goto out; + } + ret = check_vma_flags(vma, gup_flags); + if (ret) + goto out; + + if (is_vm_hugetlb_page(vma)) { + i = follow_hugetlb_page(mm, vma, pages, vmas, + &start, &nr_pages, i, + gup_flags, locked); + if (locked && *locked == 0) { + /* + * We've got a VM_FAULT_RETRY + * and we've lost mmap_lock. + * We must stop here. + */ + BUG_ON(gup_flags & FOLL_NOWAIT); + goto out; + } + continue; + } + } +retry: + /* + * If we have a pending SIGKILL, don't keep faulting pages and + * potentially allocating memory. + */ + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + cond_resched(); + + page = follow_page_mask(vma, start, foll_flags, &ctx); + if (!page || PTR_ERR(page) == -EMLINK) { + ret = faultin_page(vma, start, &foll_flags, + PTR_ERR(page) == -EMLINK, locked); + switch (ret) { + case 0: + goto retry; + case -EBUSY: + case -EAGAIN: + ret = 0; + fallthrough; + case -EFAULT: + case -ENOMEM: + case -EHWPOISON: + goto out; + } + BUG(); + } else if (PTR_ERR(page) == -EEXIST) { + /* + * Proper page table entry exists, but no corresponding + * struct page. 
If the caller expects **pages to be + * filled in, bail out now, because that can't be done + * for this page. + */ + if (pages) { + ret = PTR_ERR(page); + goto out; + } + + goto next_page; + } else if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto out; + } + if (pages) { + pages[i] = page; + flush_anon_page(vma, page, start); + flush_dcache_page(page); + ctx.page_mask = 0; + } +next_page: + if (vmas) { + vmas[i] = vma; + ctx.page_mask = 0; + } + page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); + if (page_increm > nr_pages) + page_increm = nr_pages; + i += page_increm; + start += page_increm * PAGE_SIZE; + nr_pages -= page_increm; + } while (nr_pages); +out: + if (ctx.pgmap) + put_dev_pagemap(ctx.pgmap); + return i ? i : ret; +} + +static bool vma_permits_fault(struct vm_area_struct *vma, + unsigned int fault_flags) +{ + bool write = !!(fault_flags & FAULT_FLAG_WRITE); + bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); + vm_flags_t vm_flags = write ? VM_WRITE : VM_READ; + + if (!(vm_flags & vma->vm_flags)) + return false; + + /* + * The architecture might have a hardware protection + * mechanism other than read/write that can deny access. + * + * gup always represents data access, not instruction + * fetches, so execute=false here: + */ + if (!arch_vma_access_permitted(vma, write, false, foreign)) + return false; + + return true; +} + +/** + * fixup_user_fault() - manually resolve a user page fault + * @mm: mm_struct of target mm + * @address: user address + * @fault_flags:flags to pass down to handle_mm_fault() + * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller + * does not allow retry. If NULL, the caller must guarantee + * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY. 
+ * + * This is meant to be called in the specific scenario where for locking reasons + * we try to access user memory in atomic context (within a pagefault_disable() + * section), this returns -EFAULT, and we want to resolve the user fault before + * trying again. + * + * Typically this is meant to be used by the futex code. + * + * The main difference with get_user_pages() is that this function will + * unconditionally call handle_mm_fault() which will in turn perform all the + * necessary SW fixup of the dirty and young bits in the PTE, while + * get_user_pages() only guarantees to update these in the struct page. + * + * This is important for some architectures where those bits also gate the + * access permission to the page because they are maintained in software. On + * such architectures, gup() will not be enough to make a subsequent access + * succeed. + * + * This function will not return with an unlocked mmap_lock. So it has not the + * same semantics wrt the @mm->mmap_lock as does filemap_fault(). + */ +int fixup_user_fault(struct mm_struct *mm, + unsigned long address, unsigned int fault_flags, + bool *unlocked) +{ + struct vm_area_struct *vma; + vm_fault_t ret; + + address = untagged_addr(address); + + if (unlocked) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + +retry: + vma = vma_lookup(mm, address); + if (!vma) + return -EFAULT; + + if (!vma_permits_fault(vma, fault_flags)) + return -EFAULT; + + if ((fault_flags & FAULT_FLAG_KILLABLE) && + fatal_signal_pending(current)) + return -EINTR; + + ret = handle_mm_fault(vma, address, fault_flags, NULL); + + if (ret & VM_FAULT_COMPLETED) { + /* + * NOTE: it's a pity that we need to retake the lock here + * to pair with the unlock() in the callers. Ideally we + * could tell the callers so they do not need to unlock. 
+ */ + mmap_read_lock(mm); + *unlocked = true; + return 0; + } + + if (ret & VM_FAULT_ERROR) { + int err = vm_fault_to_errno(ret, 0); + + if (err) + return err; + BUG(); + } + + if (ret & VM_FAULT_RETRY) { + mmap_read_lock(mm); + *unlocked = true; + fault_flags |= FAULT_FLAG_TRIED; + goto retry; + } + + return 0; +} +EXPORT_SYMBOL_GPL(fixup_user_fault); + +/* + * Please note that this function, unlike __get_user_pages will not + * return 0 for nr_pages > 0 without FOLL_NOWAIT + */ +static __always_inline long __get_user_pages_locked(struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + int *locked, + unsigned int flags) +{ + long ret, pages_done; + bool lock_dropped; + + if (locked) { + /* if VM_FAULT_RETRY can be returned, vmas become invalid */ + BUG_ON(vmas); + /* check caller initialized locked */ + BUG_ON(*locked != 1); + } + + if (flags & FOLL_PIN) + mm_set_has_pinned_flag(&mm->flags); + + /* + * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior + * is to set FOLL_GET if the caller wants pages[] filled in (but has + * carelessly failed to specify FOLL_GET), so keep doing that, but only + * for FOLL_GET, not for the newer FOLL_PIN. + * + * FOLL_PIN always expects pages to be non-null, but no need to assert + * that here, as any failures will be obvious enough. + */ + if (pages && !(flags & FOLL_PIN)) + flags |= FOLL_GET; + + pages_done = 0; + lock_dropped = false; + for (;;) { + ret = __get_user_pages(mm, start, nr_pages, flags, pages, + vmas, locked); + if (!locked) + /* VM_FAULT_RETRY couldn't trigger, bypass */ + return ret; + + /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */ + if (!*locked) { + BUG_ON(ret < 0); + BUG_ON(ret >= nr_pages); + } + + if (ret > 0) { + nr_pages -= ret; + pages_done += ret; + if (!nr_pages) + break; + } + if (*locked) { + /* + * VM_FAULT_RETRY didn't trigger or it was a + * FOLL_NOWAIT. 
+ */ + if (!pages_done) + pages_done = ret; + break; + } + /* + * VM_FAULT_RETRY triggered, so seek to the faulting offset. + * For the prefault case (!pages) we only update counts. + */ + if (likely(pages)) + pages += ret; + start += ret << PAGE_SHIFT; + lock_dropped = true; + +retry: + /* + * Repeat on the address that fired VM_FAULT_RETRY + * with both FAULT_FLAG_ALLOW_RETRY and + * FAULT_FLAG_TRIED. Note that GUP can be interrupted + * by fatal signals, so we need to check it before we + * start trying again otherwise it can loop forever. + */ + + if (fatal_signal_pending(current)) { + if (!pages_done) + pages_done = -EINTR; + break; + } + + ret = mmap_read_lock_killable(mm); + if (ret) { + BUG_ON(ret > 0); + if (!pages_done) + pages_done = ret; + break; + } + + *locked = 1; + ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED, + pages, NULL, locked); + if (!*locked) { + /* Continue to retry until we succeeded */ + BUG_ON(ret != 0); + goto retry; + } + if (ret != 1) { + BUG_ON(ret > 1); + if (!pages_done) + pages_done = ret; + break; + } + nr_pages--; + pages_done++; + if (!nr_pages) + break; + if (likely(pages)) + pages++; + start += PAGE_SIZE; + } + if (lock_dropped && *locked) { + /* + * We must let the caller know we temporarily dropped the lock + * and so the critical section protected by it was lost. + */ + mmap_read_unlock(mm); + *locked = 0; + } + return pages_done; +} + +/** + * populate_vma_page_range() - populate a range of pages in the vma. + * @vma: target vma + * @start: start address + * @end: end address + * @locked: whether the mmap_lock is still held + * + * This takes care of mlocking the pages too if VM_LOCKED is set. + * + * Return either number of pages pinned in the vma, or a negative error + * code on error. + * + * vma->vm_mm->mmap_lock must be held. + * + * If @locked is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @locked is non-NULL, it must held for read only and may be + * released. 
If it's released, *@locked will be set to 0. + */ +long populate_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *locked) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long nr_pages = (end - start) / PAGE_SIZE; + int gup_flags; + long ret; + + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); + VM_BUG_ON_VMA(start < vma->vm_start, vma); + VM_BUG_ON_VMA(end > vma->vm_end, vma); + mmap_assert_locked(mm); + + /* + * Rightly or wrongly, the VM_LOCKONFAULT case has never used + * faultin_page() to break COW, so it has no work to do here. + */ + if (vma->vm_flags & VM_LOCKONFAULT) + return nr_pages; + + gup_flags = FOLL_TOUCH; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) + gup_flags |= FOLL_WRITE; + + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma_is_accessible(vma)) + gup_flags |= FOLL_FORCE; + + /* + * We made sure addr is within a VMA, so the following will + * not result in a stack expansion that recurses back here. + */ + ret = __get_user_pages(mm, start, nr_pages, gup_flags, + NULL, NULL, locked); + lru_add_drain(); + return ret; +} + +/* + * faultin_vma_page_range() - populate (prefault) page tables inside the + * given VMA range readable/writable + * + * This takes care of mlocking the pages, too, if VM_LOCKED is set. + * + * @vma: target vma + * @start: start address + * @end: end address + * @write: whether to prefault readable or writable + * @locked: whether the mmap_lock is still held + * + * Returns either number of processed pages in the vma, or a negative error + * code on error (see __get_user_pages()). + * + * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and + * covered by the VMA. 
+ * + * If @locked is NULL, it may be held for read or write and will be unperturbed. + * + * If @locked is non-NULL, it must be held for read only and may be released. If + * it's released, *@locked will be set to 0. + */ +long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, bool write, int *locked) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long nr_pages = (end - start) / PAGE_SIZE; + int gup_flags; + long ret; + + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); + VM_BUG_ON_VMA(start < vma->vm_start, vma); + VM_BUG_ON_VMA(end > vma->vm_end, vma); + mmap_assert_locked(mm); + + /* + * FOLL_TOUCH: Mark page accessed and thereby young; will also mark + * the page dirty with FOLL_WRITE -- which doesn't make a + * difference with !FOLL_FORCE, because the page is writable + * in the page table. + * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit + * a poisoned page. + * !FOLL_FORCE: Require proper access permissions. + */ + gup_flags = FOLL_TOUCH | FOLL_HWPOISON; + if (write) + gup_flags |= FOLL_WRITE; + + /* + * We want to report -EINVAL instead of -EFAULT for any permission + * problems or incompatible mappings. + */ + if (check_vma_flags(vma, gup_flags)) + return -EINVAL; + + ret = __get_user_pages(mm, start, nr_pages, gup_flags, + NULL, NULL, locked); + lru_add_drain(); + return ret; +} + +/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_lock must not be held. 
+ */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend; + struct vm_area_struct *vma = NULL; + int locked = 0; + long ret = 0; + + end = start + len; + + for (nstart = start; nstart < end; nstart = nend) { + /* + * We want to fault in pages for [nstart; end) address range. + * Find first corresponding VMA. + */ + if (!locked) { + locked = 1; + mmap_read_lock(mm); + vma = find_vma_intersection(mm, nstart, end); + } else if (nstart >= vma->vm_end) + vma = find_vma_intersection(mm, vma->vm_end, end); + + if (!vma) + break; + /* + * Set [nstart; nend) to intersection of desired address + * range with the first VMA. Also, skip undesirable VMA types. + */ + nend = min(end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + /* + * Now fault in a range of pages. populate_vma_page_range() + * double checks the vma flags, so that it won't mlock pages + * if the vma was already munlocked. + */ + ret = populate_vma_page_range(vma, nstart, nend, &locked); + if (ret < 0) { + if (ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + break; + } + nend = nstart + ret * PAGE_SIZE; + ret = 0; + } + if (locked) + mmap_read_unlock(mm); + return ret; /* 0 or negative error code */ +} +#else /* CONFIG_MMU */ +static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + struct vm_area_struct **vmas, int *locked, + unsigned int foll_flags) +{ + struct vm_area_struct *vma; + unsigned long vm_flags; + long i; + + /* calculate required read or write permissions. + * If FOLL_FORCE is set, we only require the "MAY" flags. + */ + vm_flags = (foll_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (foll_flags & FOLL_FORCE) ? 
+ (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + + for (i = 0; i < nr_pages; i++) { + vma = find_vma(mm, start); + if (!vma) + goto finish_or_fault; + + /* protect what we can, including chardevs */ + if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || + !(vm_flags & vma->vm_flags)) + goto finish_or_fault; + + if (pages) { + pages[i] = virt_to_page((void *)start); + if (pages[i]) + get_page(pages[i]); + } + if (vmas) + vmas[i] = vma; + start = (start + PAGE_SIZE) & PAGE_MASK; + } + + return i; + +finish_or_fault: + return i ? : -EFAULT; +} +#endif /* !CONFIG_MMU */ + +/** + * fault_in_writeable - fault in userspace address range for writing + * @uaddr: start of address range + * @size: size of address range + * + * Returns the number of bytes not faulted in (like copy_to_user() and + * copy_from_user()). + */ +size_t fault_in_writeable(char __user *uaddr, size_t size) +{ + char __user *start = uaddr, *end; + + if (unlikely(size == 0)) + return 0; + if (!user_write_access_begin(uaddr, size)) + return size; + if (!PAGE_ALIGNED(uaddr)) { + unsafe_put_user(0, uaddr, out); + uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); + } + end = (char __user *)PAGE_ALIGN((unsigned long)start + size); + if (unlikely(end < start)) + end = NULL; + while (uaddr != end) { + unsafe_put_user(0, uaddr, out); + uaddr += PAGE_SIZE; + } + +out: + user_write_access_end(); + if (size > uaddr - start) + return size - (uaddr - start); + return 0; +} +EXPORT_SYMBOL(fault_in_writeable); + +/** + * fault_in_subpage_writeable - fault in an address range for writing + * @uaddr: start of address range + * @size: size of address range + * + * Fault in a user address range for writing while checking for permissions at + * sub-page granularity (e.g. arm64 MTE). This function should be used when + * the caller cannot guarantee forward progress of a copy_to_user() loop. + * + * Returns the number of bytes not faulted in (like copy_to_user() and + * copy_from_user()). 
+ */ +size_t fault_in_subpage_writeable(char __user *uaddr, size_t size) +{ + size_t faulted_in; + + /* + * Attempt faulting in at page granularity first for page table + * permission checking. The arch-specific probe_subpage_writeable() + * functions may not check for this. + */ + faulted_in = size - fault_in_writeable(uaddr, size); + if (faulted_in) + faulted_in -= probe_subpage_writeable(uaddr, faulted_in); + + return size - faulted_in; +} +EXPORT_SYMBOL(fault_in_subpage_writeable); + +/** + * fault_in_safe_writeable - fault in an address range for writing + * @uaddr: start of address range + * @size: length of address range + * + * Faults in an address range for writing. This is primarily useful when we + * already know that some or all of the pages in the address range aren't in + * memory. + * + * Unlike fault_in_writeable(), this function is non-destructive: it resolves + * write faults on the range without storing anything to it. + * + * Note that we don't pin or otherwise hold the pages referenced that we fault + * in. There's no guarantee that they'll stay in memory for any duration of + * time. + * + * Returns the number of bytes not faulted in, like copy_to_user() and + * copy_from_user(). 
+ */ +size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) +{ + unsigned long start = (unsigned long)uaddr, end; + struct mm_struct *mm = current->mm; + bool unlocked = false; + + if (unlikely(size == 0)) + return 0; + end = PAGE_ALIGN(start + size); + if (end < start) + end = 0; + + mmap_read_lock(mm); + do { + if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked)) + break; + start = (start + PAGE_SIZE) & PAGE_MASK; + } while (start != end); + mmap_read_unlock(mm); + + /* + * start only ever advances from uaddr: it now points at the first page + * that could not be faulted in, or at end if every page succeeded. So + * start - uaddr is the number of bytes handled. The operands must not + * be swapped: uaddr - start wraps around as soon as any page has been + * handled, which would report a partial failure as complete success. + */ + if (size > start - (unsigned long)uaddr) + return size - (start - (unsigned long)uaddr); + return 0; +} +EXPORT_SYMBOL(fault_in_safe_writeable); + +/** + * fault_in_readable - fault in userspace address range for reading + * @uaddr: start of user address range + * @size: size of user address range + * + * Returns the number of bytes not faulted in (like copy_to_user() and + * copy_from_user()). + */ +size_t fault_in_readable(const char __user *uaddr, size_t size) +{ + const char __user *start = uaddr, *end; + volatile char c; + + if (unlikely(size == 0)) + return 0; + if (!user_read_access_begin(uaddr, size)) + return size; + if (!PAGE_ALIGNED(uaddr)) { + unsafe_get_user(c, uaddr, out); + uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); + } + end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); + if (unlikely(end < start)) + end = NULL; + while (uaddr != end) { + unsafe_get_user(c, uaddr, out); + uaddr += PAGE_SIZE; + } + +out: + user_read_access_end(); + (void)c; + if (size > uaddr - start) + return size - (uaddr - start); + return 0; +} +EXPORT_SYMBOL(fault_in_readable); + +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by put_page(). 
+ * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save disk space. + * + * Called without mmap_lock (takes and releases the mmap_lock by itself). + */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct mm_struct *mm = current->mm; + struct page *page; + int locked = 1; + int ret; + + if (mmap_read_lock_killable(mm)) + return NULL; + ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked, + FOLL_FORCE | FOLL_DUMP | FOLL_GET); + if (locked) + mmap_read_unlock(mm); + return (ret == 1) ? page : NULL; +} +#endif /* CONFIG_ELF_CORE */ + +#ifdef CONFIG_MIGRATION +/* + * Returns the number of collected pages. Return value is always >= 0. + */ +static unsigned long collect_longterm_unpinnable_pages( + struct list_head *movable_page_list, + unsigned long nr_pages, + struct page **pages) +{ + unsigned long i, collected = 0; + struct folio *prev_folio = NULL; + bool drain_allow = true; + + for (i = 0; i < nr_pages; i++) { + struct folio *folio = page_folio(pages[i]); + + if (folio == prev_folio) + continue; + prev_folio = folio; + + if (folio_is_longterm_pinnable(folio)) + continue; + + collected++; + + if (folio_is_device_coherent(folio)) + continue; + + if (folio_test_hugetlb(folio)) { + isolate_hugetlb(&folio->page, movable_page_list); + continue; + } + + if (!folio_test_lru(folio) && drain_allow) { + lru_add_drain_all(); + drain_allow = false; + } + + if (folio_isolate_lru(folio)) + continue; + + list_add_tail(&folio->lru, movable_page_list); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); + } + + return collected; +} + +/* + * Unpins all pages and migrates device coherent pages and movable_page_list. 
+ * Returns -EAGAIN if all pages were successfully migrated or -errno for failure + * (or partial success). + */ +static int migrate_longterm_unpinnable_pages( + struct list_head *movable_page_list, + unsigned long nr_pages, + struct page **pages) +{ + int ret; + unsigned long i; + + for (i = 0; i < nr_pages; i++) { + struct folio *folio = page_folio(pages[i]); + + if (folio_is_device_coherent(folio)) { + /* + * Migration will fail if the page is pinned, so convert + * the pin on the source page to a normal reference. + */ + pages[i] = NULL; + folio_get(folio); + gup_put_folio(folio, 1, FOLL_PIN); + + if (migrate_device_coherent_page(&folio->page)) { + ret = -EBUSY; + goto err; + } + + continue; + } + + /* + * We can't migrate pages with unexpected references, so drop + * the reference obtained by __get_user_pages_locked(). + * Migrating pages have been added to movable_page_list after + * calling folio_isolate_lru() which takes a reference so the + * page won't be freed if it's migrating. + */ + unpin_user_page(pages[i]); + pages[i] = NULL; + } + + if (!list_empty(movable_page_list)) { + struct migration_target_control mtc = { + .nid = NUMA_NO_NODE, + .gfp_mask = GFP_USER | __GFP_NOWARN, + }; + + if (migrate_pages(movable_page_list, alloc_migration_target, + NULL, (unsigned long)&mtc, MIGRATE_SYNC, + MR_LONGTERM_PIN, NULL)) { + ret = -ENOMEM; + goto err; + } + } + + putback_movable_pages(movable_page_list); + + return -EAGAIN; + +err: + for (i = 0; i < nr_pages; i++) + if (pages[i]) + unpin_user_page(pages[i]); + putback_movable_pages(movable_page_list); + + return ret; +} + +/* + * Check whether all pages are *allowed* to be pinned. Rather confusingly, all + * pages in the range are required to be pinned via FOLL_PIN, before calling + * this routine. + * + * If any pages in the range are not allowed to be pinned, then this routine + * will migrate those pages away, unpin all the pages in the range and return + * -EAGAIN. 
The caller should re-pin the entire range with FOLL_PIN and then + * call this routine again. + * + * If an error other than -EAGAIN occurs, this indicates a migration failure. + * The caller should give up, and propagate the error back up the call stack. + * + * If everything is OK and all pages in the range are allowed to be pinned, then + * this routine leaves all pages pinned and returns zero for success. + */ +static long check_and_migrate_movable_pages(unsigned long nr_pages, + struct page **pages) +{ + unsigned long collected; + LIST_HEAD(movable_page_list); + + collected = collect_longterm_unpinnable_pages(&movable_page_list, + nr_pages, pages); + if (!collected) + return 0; + + return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages, + pages); +} +#else +static long check_and_migrate_movable_pages(unsigned long nr_pages, + struct page **pages) +{ + return 0; +} +#endif /* CONFIG_MIGRATION */ + +/* + * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which + * allows us to process the FOLL_LONGTERM flag. + */ +static long __gup_longterm_locked(struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) +{ + unsigned int flags; + long rc, nr_pinned_pages; + + if (!(gup_flags & FOLL_LONGTERM)) + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + NULL, gup_flags); + + /* + * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM + * implies FOLL_PIN (although the reverse is not true). Therefore it is + * correct to unconditionally call check_and_migrate_movable_pages() + * which assumes pages have been pinned via FOLL_PIN. + * + * Enforce the above reasoning by asserting that FOLL_PIN is set. 
+ */ + if (WARN_ON(!(gup_flags & FOLL_PIN))) + return -EINVAL; + flags = memalloc_pin_save(); + do { + nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, + pages, vmas, NULL, + gup_flags); + if (nr_pinned_pages <= 0) { + rc = nr_pinned_pages; + break; + } + rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); + } while (rc == -EAGAIN); + memalloc_pin_restore(flags); + + return rc ? rc : nr_pinned_pages; +} + +static bool is_valid_gup_flags(unsigned int gup_flags) +{ + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that with an assertion: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return false; + /* + * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying + * that is, FOLL_LONGTERM is a specific case, more restrictive case of + * FOLL_PIN. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return false; + + return true; +} + +#ifdef CONFIG_MMU +static long __get_user_pages_remote(struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked) +{ + /* + * Parts of FOLL_LONGTERM behavior are incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. However, this only comes up if locked is set, and there are + * callers that do request FOLL_LONGTERM, but do not set locked. So, + * allow what we can. 
+ */ + if (gup_flags & FOLL_LONGTERM) { + if (WARN_ON_ONCE(locked)) + return -EINVAL; + /* + * This will check the vmas (even if our vmas arg is NULL) + * and return -ENOTSUPP if DAX isn't allowed in this case: + */ + return __gup_longterm_locked(mm, start, nr_pages, pages, + vmas, gup_flags | FOLL_TOUCH | + FOLL_REMOTE); + } + + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + locked, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); +} + +/** + * get_user_pages_remote() - pin user pages in memory + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. + * + * Returns either number of pages pinned (which may be less than the + * number requested), or an error. Details about the return value: + * + * -- If nr_pages is 0, returns 0. + * -- If nr_pages is >0, but no pages were pinned, returns -errno. + * -- If nr_pages is >0, and some pages were pinned, returns the number of + * pages pinned. Again, this may be less than nr_pages. + * + * The caller is responsible for releasing returned @pages, via put_page(). + * + * @vmas are valid only as long as mmap_lock is held. + * + * Must be called with mmap_lock held for read or write. + * + * get_user_pages_remote walks a process's page tables and takes a reference + * to each struct page that each user address corresponds to at a given + * instant. 
That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages_remote returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page + * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must + * be called after the page is finished with, and before put_page is called. + * + * get_user_pages_remote is typically used for fewer-copy IO operations, + * to get a handle on the memory by some means other than accesses + * via the user virtual addresses. The pages may be submitted for + * DMA to devices or accessed via their kernel linear mapping (via the + * kmap APIs). Care should be taken to use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + * + * get_user_pages_remote should be phased out in favor of + * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing + * should use get_user_pages_remote because it cannot pass + * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. 
+ */ +long get_user_pages_remote(struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked) +{ + if (!is_valid_gup_flags(gup_flags)) + return -EINVAL; + + return __get_user_pages_remote(mm, start, nr_pages, gup_flags, + pages, vmas, locked); +} +EXPORT_SYMBOL(get_user_pages_remote); + +#else /* CONFIG_MMU */ +long get_user_pages_remote(struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked) +{ + return 0; +} + +static long __get_user_pages_remote(struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked) +{ + return 0; +} +#endif /* !CONFIG_MMU */ + +/** + * get_user_pages() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * This is the same as get_user_pages_remote(), just with a less-flexible + * calling convention where we assume that the mm being operated on belongs to + * the current task, and doesn't allow passing of a locked parameter. We also + * obviously don't pass FOLL_REMOTE in here. 
+ */ +long get_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas) +{ + if (!is_valid_gup_flags(gup_flags)) + return -EINVAL; + + return __gup_longterm_locked(current->mm, start, nr_pages, + pages, vmas, gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(get_user_pages); + +/* + * get_user_pages_unlocked() is suitable to replace the form: + * + * mmap_read_lock(mm); + * get_user_pages(start, nr_pages, gup_flags, pages, NULL); + * mmap_read_unlock(mm); + * + * with: + * + * get_user_pages_unlocked(start, nr_pages, pages, gup_flags); + * + * where mm is current->mm; the mmap_lock is taken for read here and is + * released again before returning. + * + * It is functionally equivalent to get_user_pages_fast so + * get_user_pages_fast should be used instead if specific gup_flags + * (e.g. FOLL_FORCE) are not required. + */ +long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags) +{ + struct mm_struct *mm = current->mm; + int locked = 1; + long ret; + + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + + mmap_read_lock(mm); + ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL, + &locked, gup_flags | FOLL_TOUCH); + if (locked) + mmap_read_unlock(mm); + return ret; +} +EXPORT_SYMBOL(get_user_pages_unlocked); + +/* + * Fast GUP + * + * get_user_pages_fast attempts to pin user pages by walking the page + * tables directly and avoids taking locks. Thus the walker needs to be + * protected from page table pages being freed from under it, and should + * block any THP splits. + * + * One way to achieve this is to have the walker disable interrupts, and + * rely on IPIs from the TLB flushing code blocking before the page table + * pages are freed. 
This is unsuitable for architectures that do not need
+ * to broadcast an IPI when invalidating TLBs.
+ *
+ * Another way to achieve this is to batch up page table containing pages
+ * belonging to more than one mm_user, then rcu_sched a callback to free those
+ * pages. Disabling interrupts will allow the fast_gup walker to both block
+ * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
+ * (which is a relatively rare event). The code below adopts this strategy.
+ *
+ * Before activating this code, please be aware that the following assumptions
+ * are currently made:
+ *
+ *  *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
+ *  free pages containing page tables or TLB flushing requires IPI broadcast.
+ *
+ *  *) ptes can be read atomically by the architecture.
+ *
+ *  *) access_ok is sufficient to validate userspace address ranges.
+ *
+ * The last two assumptions can be relaxed by the addition of helper functions.
+ *
+ * This code is based heavily on the PowerPC implementation by Nick Piggin.
+ */
+#ifdef CONFIG_HAVE_FAST_GUP
+
+static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
+					    unsigned int flags,
+					    struct page **pages)
+{
+	while ((*nr) - nr_start) {	/* until *nr is back down to nr_start */
+		struct page *page = pages[--(*nr)];	/* newest recorded entry first */
+
+		ClearPageReferenced(page);
+		if (flags & FOLL_PIN)
+			unpin_user_page(page);
+		else
+			put_page(page);
+	}
+}
+
+#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already).  To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+			 unsigned long end, unsigned int flags,
+			 struct page **pages, int *nr)
+{
+	struct dev_pagemap *pgmap = NULL;
+	int nr_start = *nr, ret = 0;
+	pte_t *ptep, *ptem;
+
+	ptem = ptep = pte_offset_map(&pmd, addr);	/* ptem is kept for pte_unmap() */
+	do {
+		pte_t pte = ptep_get_lockless(ptep);
+		struct page *page;
+		struct folio *folio;
+
+		if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
+			goto pte_unmap;
+
+		if (!pte_access_permitted(pte, flags & FOLL_WRITE))
+			goto pte_unmap;
+
+		if (pte_devmap(pte)) {
+			if (unlikely(flags & FOLL_LONGTERM))
+				goto pte_unmap;
+
+			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
+			if (unlikely(!pgmap)) {
+				undo_dev_pagemap(nr, nr_start, flags, pages);
+				goto pte_unmap;
+			}
+		} else if (pte_special(pte))
+			goto pte_unmap;
+
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+
+		folio = try_grab_folio(page, 1, flags);
+		if (!folio)
+			goto pte_unmap;
+
+		if (unlikely(page_is_secretmem(page))) {
+			gup_put_folio(folio, 1, flags);
+			goto pte_unmap;
+		}
+
+		if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+		    unlikely(pte_val(pte) != pte_val(*ptep))) {	/* pmd or pte changed since read */
+			gup_put_folio(folio, 1, flags);
+			goto pte_unmap;
+		}
+
+		if (!pte_write(pte) && gup_must_unshare(flags, page)) {
+			gup_put_folio(folio, 1, flags);
+			goto pte_unmap;
+		}
+
+		/*
+		 * We need to make the page accessible if and only if we are
+		 * going to access its content (the FOLL_PIN case).  Please
+		 * see Documentation/core-api/pin_user_pages.rst for
+		 * details.
+ */ + if (flags & FOLL_PIN) { + ret = arch_make_page_accessible(page); + if (ret) { + gup_put_folio(folio, 1, flags); + goto pte_unmap; + } + } + folio_set_referenced(folio); + pages[*nr] = page; + (*nr)++; + } while (ptep++, addr += PAGE_SIZE, addr != end); + + ret = 1; + +pte_unmap: + if (pgmap) + put_dev_pagemap(pgmap); + pte_unmap(ptem); + return ret; +} +#else + +/* + * If we can't determine whether or not a pte is special, then fail immediately + * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not + * to be special. + * + * For a futex to be placed on a THP tail page, get_futex_key requires a + * get_user_pages_fast_only implementation that can pin pages. Thus it's still + * useful to have gup_huge_pmd even if we can't operate on ptes. + */ +static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, + struct page **pages, int *nr) +{ + return 0; +} +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ + +#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +static int __gup_device_huge(unsigned long pfn, unsigned long addr, + unsigned long end, unsigned int flags, + struct page **pages, int *nr) +{ + int nr_start = *nr; + struct dev_pagemap *pgmap = NULL; + + do { + struct page *page = pfn_to_page(pfn); + + pgmap = get_dev_pagemap(pfn, pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, flags, pages); + break; + } + SetPageReferenced(page); + pages[*nr] = page; + if (unlikely(!try_grab_page(page, flags))) { + undo_dev_pagemap(nr, nr_start, flags, pages); + break; + } + (*nr)++; + pfn++; + } while (addr += PAGE_SIZE, addr != end); + + put_dev_pagemap(pgmap); + return addr == end; +} + +static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, + unsigned long end, unsigned int flags, + struct page **pages, int *nr) +{ + unsigned long fault_pfn; + int nr_start = *nr; + + fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + 
if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
+		return 0;
+
+	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {	/* pmd changed under us */
+		undo_dev_pagemap(nr, nr_start, flags, pages);
+		return 0;
+	}
+	return 1;
+}
+
+static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
+				 unsigned long end, unsigned int flags,
+				 struct page **pages, int *nr)
+{
+	unsigned long fault_pfn;
+	int nr_start = *nr;
+
+	fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
+		return 0;
+
+	if (unlikely(pud_val(orig) != pud_val(*pudp))) {	/* pud changed under us */
+		undo_dev_pagemap(nr, nr_start, flags, pages);
+		return 0;
+	}
+	return 1;
+}
+#else	/* !(CONFIG_ARCH_HAS_PTE_DEVMAP && CONFIG_TRANSPARENT_HUGEPAGE) */
+static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
+				 unsigned long end, unsigned int flags,
+				 struct page **pages, int *nr)
+{
+	BUILD_BUG();
+	return 0;
+}
+
+static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
+				 unsigned long end, unsigned int flags,
+				 struct page **pages, int *nr)
+{
+	BUILD_BUG();
+	return 0;
+}
+#endif	/* CONFIG_ARCH_HAS_PTE_DEVMAP && CONFIG_TRANSPARENT_HUGEPAGE */
+
+static int record_subpages(struct page *page, unsigned long addr,
+			   unsigned long end, struct page **pages)
+{
+	int nr;
+
+	for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)	/* one entry per PAGE_SIZE step */
+		pages[nr] = nth_page(page, nr);
+
+	return nr;	/* number of entries written to pages[] */
+}
+
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
+				      unsigned long sz)
+{
+	unsigned long __boundary = (addr + sz) & ~(sz-1);
+	return (__boundary - 1 < end - 1) ?
__boundary : end;
+}
+
+static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+		       unsigned long end, unsigned int flags,
+		       struct page **pages, int *nr)
+{
+	unsigned long pte_end;
+	struct page *page;
+	struct folio *folio;
+	pte_t pte;
+	int refs;
+
+	pte_end = (addr + sz) & ~(sz-1);
+	if (pte_end < end)
+		end = pte_end;	/* never walk past the next sz-aligned boundary */
+
+	pte = huge_ptep_get(ptep);
+
+	if (!pte_access_permitted(pte, flags & FOLL_WRITE))
+		return 0;
+
+	/* hugepages are never "special" */
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
+	refs = record_subpages(page, addr, end, pages + *nr);
+
+	folio = try_grab_folio(page, refs, flags);
+	if (!folio)
+		return 0;
+
+	if (unlikely(pte_val(pte) != pte_val(*ptep))) {	/* pte changed since it was read */
+		gup_put_folio(folio, refs, flags);
+		return 0;
+	}
+
+	if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) {
+		gup_put_folio(folio, refs, flags);
+		return 0;
+	}
+
+	*nr += refs;	/* the entries recorded above only count from here on */
+	folio_set_referenced(folio);
+	return 1;
+}
+
+static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+		unsigned int pdshift, unsigned long end, unsigned int flags,
+		struct page **pages, int *nr)
+{
+	pte_t *ptep;
+	unsigned long sz = 1UL << hugepd_shift(hugepd);
+	unsigned long next;
+
+	ptep = hugepte_offset(hugepd, addr, pdshift);
+	do {
+		next = hugepte_addr_end(addr, end, sz);
+		if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
+			return 0;
+	} while (ptep++, addr = next, addr != end);
+
+	return 1;
+}
+#else
+static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+		unsigned int pdshift, unsigned long end, unsigned int flags,
+		struct page **pages, int *nr)
+{
+	return 0;
+}
+#endif /* CONFIG_ARCH_HAS_HUGEPD */
+
+static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
+			unsigned long end, unsigned int flags,
+			struct page **pages, int *nr)
+{
+	struct page *page;
+	struct folio *folio;
+	int refs;
+
+	if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
+		return 0;
+
+	if
(pmd_devmap(orig)) {
+		if (unlikely(flags & FOLL_LONGTERM))
+			return 0;
+		return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
+					     pages, nr);
+	}
+
+	page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
+	refs = record_subpages(page, addr, end, pages + *nr);
+
+	folio = try_grab_folio(page, refs, flags);
+	if (!folio)
+		return 0;
+
+	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {	/* pmd changed since it was read */
+		gup_put_folio(folio, refs, flags);
+		return 0;
+	}
+
+	if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) {
+		gup_put_folio(folio, refs, flags);
+		return 0;
+	}
+
+	*nr += refs;
+	folio_set_referenced(folio);
+	return 1;
+}
+
+static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
+			unsigned long end, unsigned int flags,
+			struct page **pages, int *nr)
+{
+	struct page *page;
+	struct folio *folio;
+	int refs;
+
+	if (!pud_access_permitted(orig, flags & FOLL_WRITE))
+		return 0;
+
+	if (pud_devmap(orig)) {
+		if (unlikely(flags & FOLL_LONGTERM))
+			return 0;
+		return __gup_device_huge_pud(orig, pudp, addr, end, flags,
+					     pages, nr);
+	}
+
+	page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT);
+	refs = record_subpages(page, addr, end, pages + *nr);
+
+	folio = try_grab_folio(page, refs, flags);
+	if (!folio)
+		return 0;
+
+	if (unlikely(pud_val(orig) != pud_val(*pudp))) {	/* pud changed since it was read */
+		gup_put_folio(folio, refs, flags);
+		return 0;
+	}
+
+	if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) {
+		gup_put_folio(folio, refs, flags);
+		return 0;
+	}
+
+	*nr += refs;
+	folio_set_referenced(folio);
+	return 1;
+}
+
+static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
+			unsigned long end, unsigned int flags,
+			struct page **pages, int *nr)
+{
+	int refs;
+	struct page *page;
+	struct folio *folio;
+
+	if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
+		return 0;
+
+	BUILD_BUG_ON(pgd_devmap(orig));
+
+	page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT);
+	refs = record_subpages(page, addr, end, pages +
*nr);
+
+	folio = try_grab_folio(page, refs, flags);
+	if (!folio)
+		return 0;
+
+	if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {	/* pgd changed since it was read */
+		gup_put_folio(folio, refs, flags);
+		return 0;
+	}
+
+	*nr += refs;
+	folio_set_referenced(folio);
+	return 1;
+}
+
+static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
+		unsigned int flags, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset_lockless(pudp, pud, addr);
+	do {
+		pmd_t pmd = READ_ONCE(*pmdp);	/* one snapshot, used for all checks below */
+
+		next = pmd_addr_end(addr, end);
+		if (!pmd_present(pmd))
+			return 0;
+
+		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
+			     pmd_devmap(pmd))) {
+			if (pmd_protnone(pmd) &&
+			    !gup_can_follow_protnone(flags))
+				return 0;
+
+			if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
+				pages, nr))
+				return 0;
+
+		} else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
+			/*
+			 * Some architectures use a different format for
+			 * hugetlbfs pmds than for THP pmds; such entries are
+			 * handled as a hugepd here.
+			 */
+			if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
+					 PMD_SHIFT, next, flags, pages, nr))
+				return 0;
+		} else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
+			return 0;
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
+			 unsigned int flags, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset_lockless(p4dp, p4d, addr);
+	do {
+		pud_t pud = READ_ONCE(*pudp);	/* one snapshot, used for all checks below */
+
+		next = pud_addr_end(addr, end);
+		if (unlikely(!pud_present(pud)))
+			return 0;
+		if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
+			if (!gup_huge_pud(pud, pudp, addr, next, flags,
+					  pages, nr))
+				return 0;
+		} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
+			if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
+					 PUD_SHIFT, next, flags, pages, nr))
+				return 0;
+		} else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
+			return 0;
+	} while (pudp++, addr = next, addr != end);
+
+
return 1;
+}
+
+static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
+			 unsigned int flags, struct page **pages, int *nr)
+{
+	unsigned long next;
+	p4d_t *p4dp;
+
+	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+	do {
+		p4d_t p4d = READ_ONCE(*p4dp);
+
+		next = p4d_addr_end(addr, end);
+		if (p4d_none(p4d))
+			return 0;
+		BUILD_BUG_ON(p4d_huge(p4d));
+		if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
+			if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
+					 P4D_SHIFT, next, flags, pages, nr))
+				return 0;
+		} else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
+			return 0;
+	} while (p4dp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static void gup_pgd_range(unsigned long addr, unsigned long end,
+		unsigned int flags, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pgd_t *pgdp;
+
+	pgdp = pgd_offset(current->mm, addr);
+	do {
+		pgd_t pgd = READ_ONCE(*pgdp);
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			return;	/* no status is returned; progress is visible only through *nr */
+		if (unlikely(pgd_huge(pgd))) {
+			if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
+					  pages, nr))
+				return;
+		} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
+			if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
+					 PGDIR_SHIFT, next, flags, pages, nr))
+				return;
+		} else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
+			return;
+	} while (pgdp++, addr = next, addr != end);
+}
+#else
+static inline void gup_pgd_range(unsigned long addr, unsigned long end,
+		unsigned int flags, struct page **pages, int *nr)
+{
+}
+#endif /* CONFIG_HAVE_FAST_GUP */
+
+#ifndef gup_fast_permitted
+/*
+ * Check if it's allowed to use get_user_pages_fast_only() for the range, or
+ * we need to fall back to the slow version.
+ *
+ * This is the default, built only when nothing has already defined
+ * gup_fast_permitted: it allows the fast path for every range.
+ */
+static bool gup_fast_permitted(unsigned long start, unsigned long end)
+{
+	return true;
+}
+#endif
+
+static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
+				   unsigned int gup_flags, struct page **pages)
+{
+	int ret;
+
+	/*
+	 * FIXME: FOLL_LONGTERM does not
work with + * get_user_pages_unlocked() (see comments in that function) + */ + if (gup_flags & FOLL_LONGTERM) { + mmap_read_lock(current->mm); + ret = __gup_longterm_locked(current->mm, + start, nr_pages, + pages, NULL, gup_flags); + mmap_read_unlock(current->mm); + } else { + ret = get_user_pages_unlocked(start, nr_pages, + pages, gup_flags); + } + + return ret; +} + +static unsigned long lockless_pages_from_mm(unsigned long start, + unsigned long end, + unsigned int gup_flags, + struct page **pages) +{ + unsigned long flags; + int nr_pinned = 0; + unsigned seq; + + if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || + !gup_fast_permitted(start, end)) + return 0; + + if (gup_flags & FOLL_PIN) { + seq = raw_read_seqcount(¤t->mm->write_protect_seq); + if (seq & 1) + return 0; + } + + /* + * Disable interrupts. The nested form is used, in order to allow full, + * general purpose use of this routine. + * + * With interrupts disabled, we block page table pages from being freed + * from under us. See struct mmu_table_batch comments in + * include/asm-generic/tlb.h for more details. + * + * We do not adopt an rcu_read_lock() here as we also want to block IPIs + * that come from THPs splitting. + */ + local_irq_save(flags); + gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); + local_irq_restore(flags); + + /* + * When pinning pages for DMA there could be a concurrent write protect + * from fork() via copy_page_range(), in this case always fail fast GUP. 
+ */ + if (gup_flags & FOLL_PIN) { + if (read_seqcount_retry(¤t->mm->write_protect_seq, seq)) { + unpin_user_pages_lockless(pages, nr_pinned); + return 0; + } else { + sanity_check_pinned_pages(pages, nr_pinned); + } + } + return nr_pinned; +} + +static int internal_get_user_pages_fast(unsigned long start, + unsigned long nr_pages, + unsigned int gup_flags, + struct page **pages) +{ + unsigned long len, end; + unsigned long nr_pinned; + int ret; + + if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | + FOLL_FORCE | FOLL_PIN | FOLL_GET | + FOLL_FAST_ONLY | FOLL_NOFAULT))) + return -EINVAL; + + if (gup_flags & FOLL_PIN) + mm_set_has_pinned_flag(¤t->mm->flags); + + if (!(gup_flags & FOLL_FAST_ONLY)) + might_lock_read(¤t->mm->mmap_lock); + + start = untagged_addr(start) & PAGE_MASK; + len = nr_pages << PAGE_SHIFT; + if (check_add_overflow(start, len, &end)) + return 0; + if (unlikely(!access_ok((void __user *)start, len))) + return -EFAULT; + + nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages); + if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) + return nr_pinned; + + /* Slow path: try to get the remaining pages with get_user_pages */ + start += nr_pinned << PAGE_SHIFT; + pages += nr_pinned; + ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags, + pages); + if (ret < 0) { + /* + * The caller has to unpin the pages we already pinned so + * returning -errno is not an option + */ + if (nr_pinned) + return nr_pinned; + return ret; + } + return ret + nr_pinned; +} + +/** + * get_user_pages_fast_only() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to + * the regular GUP. 
+ * Note a difference with get_user_pages_fast: this always returns the
+ * number of pages pinned, 0 if no pages were pinned.
+ *
+ * If the architecture does not support this function, simply return with no
+ * pages pinned.
+ *
+ * Careful, careful! COW breaking can go either way, so a non-write
+ * access can get ambiguous page results. If you call this function without
+ * 'write' set, you'd better be sure that you're ok with that ambiguity.
+ */
+int get_user_pages_fast_only(unsigned long start, int nr_pages,
+			     unsigned int gup_flags, struct page **pages)
+{
+	int nr_pinned;
+	/*
+	 * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
+	 * because gup fast is always a "pin with a +1 page refcount" request.
+	 *
+	 * FOLL_FAST_ONLY is required in order to match the API description of
+	 * this routine: no fall back to regular ("slow") GUP.
+	 */
+	gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
+
+	nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
+						 pages);
+
+	/*
+	 * As specified in the API description above, this routine is not
+	 * allowed to return negative values. However, the common core
+	 * routine internal_get_user_pages_fast() *can* return -errno.
+	 * Therefore, clamp any negative result to 0 here:
+	 */
+	if (nr_pinned < 0)
+		nr_pinned = 0;
+
+	return nr_pinned;
+}
+EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start:      starting user address
+ * @nr_pages:   number of pages from start to pin
+ * @gup_flags:  flags modifying pin behaviour
+ * @pages:      array that receives pointers to the pages pinned.
+ *              Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_lock.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number requested.
+ * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
+ * -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages,
+			unsigned int gup_flags, struct page **pages)
+{
+	if (!is_valid_gup_flags(gup_flags))
+		return -EINVAL;
+
+	/*
+	 * The caller may or may not have explicitly set FOLL_GET; either way is
+	 * OK. However, internally (within mm/gup.c), gup fast variants must set
+	 * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
+	 * request.
+	 */
+	gup_flags |= FOLL_GET;
+	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
+}
+EXPORT_SYMBOL_GPL(get_user_pages_fast);
+
+/**
+ * pin_user_pages_fast() - pin user pages in memory without taking locks
+ *
+ * @start:      starting user address
+ * @nr_pages:   number of pages from start to pin
+ * @gup_flags:  flags modifying pin behaviour
+ * @pages:      array that receives pointers to the pages pinned.
+ *              Should be at least nr_pages long.
+ *
+ * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
+ * get_user_pages_fast() for documentation on the function arguments, because
+ * the arguments here are identical.
+ *
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
+ * see Documentation/core-api/pin_user_pages.rst for further details.
+ *
+ * Return: -EINVAL, after a one-time warning, if @gup_flags contains FOLL_GET
+ * or if @pages is NULL; otherwise the result of
+ * internal_get_user_pages_fast().
+ */
+int pin_user_pages_fast(unsigned long start, int nr_pages,
+			unsigned int gup_flags, struct page **pages)
+{
+	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
+	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(!pages))
+		return -EINVAL;
+
+	gup_flags |= FOLL_PIN;
+	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
+}
+EXPORT_SYMBOL_GPL(pin_user_pages_fast);
+
+/*
+ * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
+ * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
+ *
+ * The API rules are the same, too: no negative values may be returned.
+ */
+int pin_user_pages_fast_only(unsigned long start, int nr_pages,
+			     unsigned int gup_flags, struct page **pages)
+{
+	int nr_pinned;
+
+	/*
+	 * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
+	 * rules require returning 0, rather than -errno:
+	 */
+	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+		return 0;
+
+	if (WARN_ON_ONCE(!pages))	/* same rule: 0, not -errno */
+		return 0;
+	/*
+	 * FOLL_FAST_ONLY is required in order to match the API description of
+	 * this routine: no fall back to regular ("slow") GUP.
+	 */
+	gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
+	nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
+						 pages);
+	/*
+	 * This routine is not allowed to return negative values. However,
+	 * internal_get_user_pages_fast() *can* return -errno. Therefore,
+	 * clamp any negative result to 0 here:
+	 */
+	if (nr_pinned < 0)
+		nr_pinned = 0;
+
+	return nr_pinned;
+}
+EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
+
+/**
+ * pin_user_pages_remote() - pin pages of a remote process
+ *
+ * @mm:		mm_struct of target mm
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @gup_flags:	flags modifying lookup behaviour
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long.
+ * @vmas:	array of pointers to vmas corresponding to each page.
+ *		Or NULL if the caller does not require them.
+ * @locked:	pointer to lock flag indicating whether lock is held and
+ *		subsequently whether VM_FAULT_RETRY functionality can be
+ *		utilised. Lock must initially be held.
+ *
+ * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
+ * get_user_pages_remote() for documentation on the function arguments, because
+ * the arguments here are identical.
+ *
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
+ * see Documentation/core-api/pin_user_pages.rst for details.
+ */
+long pin_user_pages_remote(struct mm_struct *mm,
+			   unsigned long start, unsigned long nr_pages,
+			   unsigned int gup_flags, struct page **pages,
+			   struct vm_area_struct **vmas, int *locked)
+{
+	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
+	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(!pages))
+		return -EINVAL;
+
+	gup_flags |= FOLL_PIN;
+	return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
+				       pages, vmas, locked);
+}
+EXPORT_SYMBOL(pin_user_pages_remote);
+
+/**
+ * pin_user_pages() - pin user pages in memory for use by other devices
+ *
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @gup_flags:	flags modifying lookup behaviour
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long.
+ * @vmas:	array of pointers to vmas corresponding to each page.
+ *		Or NULL if the caller does not require them.
+ *
+ * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
+ * FOLL_PIN is set.
+ *
+ * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
+ * see Documentation/core-api/pin_user_pages.rst for details.
+ *
+ * Return: -EINVAL, after a one-time warning, if @gup_flags contains FOLL_GET
+ * or if @pages is NULL; otherwise the result of __gup_longterm_locked() on
+ * current->mm.
+ */
+long pin_user_pages(unsigned long start, unsigned long nr_pages,
+		    unsigned int gup_flags, struct page **pages,
+		    struct vm_area_struct **vmas)
+{
+	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
+	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+		return -EINVAL;
+
+	if (WARN_ON_ONCE(!pages))
+		return -EINVAL;
+
+	gup_flags |= FOLL_PIN;
+	return __gup_longterm_locked(current->mm, start, nr_pages,
+				     pages, vmas, gup_flags);
+}
+EXPORT_SYMBOL(pin_user_pages);
+
+/*
+ * pin_user_pages_unlocked() is the FOLL_PIN variant of
+ * get_user_pages_unlocked(). Behavior is the same, except that this one sets
+ * FOLL_PIN and rejects FOLL_GET.
+ */ +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags) +{ + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + if (WARN_ON_ONCE(!pages)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); +} +EXPORT_SYMBOL(pin_user_pages_unlocked); diff --git a/mm/gup_test.c b/mm/gup_test.c new file mode 100644 index 000000000..12b0a9176 --- /dev/null +++ b/mm/gup_test.c @@ -0,0 +1,250 @@ +#include +#include +#include +#include +#include +#include +#include "gup_test.h" + +static void put_back_pages(unsigned int cmd, struct page **pages, + unsigned long nr_pages, unsigned int gup_test_flags) +{ + unsigned long i; + + switch (cmd) { + case GUP_FAST_BENCHMARK: + case GUP_BASIC_TEST: + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + break; + + case PIN_FAST_BENCHMARK: + case PIN_BASIC_TEST: + case PIN_LONGTERM_BENCHMARK: + unpin_user_pages(pages, nr_pages); + break; + case DUMP_USER_PAGES_TEST: + if (gup_test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) { + unpin_user_pages(pages, nr_pages); + } else { + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + + } + break; + } +} + +static void verify_dma_pinned(unsigned int cmd, struct page **pages, + unsigned long nr_pages) +{ + unsigned long i; + struct page *page; + + switch (cmd) { + case PIN_FAST_BENCHMARK: + case PIN_BASIC_TEST: + case PIN_LONGTERM_BENCHMARK: + for (i = 0; i < nr_pages; i++) { + page = pages[i]; + if (WARN(!page_maybe_dma_pinned(page), + "pages[%lu] is NOT dma-pinned\n", i)) { + + dump_page(page, "gup_test failure"); + break; + } else if (cmd == PIN_LONGTERM_BENCHMARK && + WARN(!is_longterm_pinnable_page(page), + "pages[%lu] is NOT pinnable but pinned\n", + i)) { + dump_page(page, "gup_test failure"); + break; + } + } + break; + } +} + +static void dump_pages_test(struct gup_test *gup, struct page **pages, + 
unsigned long nr_pages) +{ + unsigned int index_to_dump; + unsigned int i; + + /* + * Zero out any user-supplied page index that is out of range. Remember: + * .which_pages[] contains a 1-based set of page indices. + */ + for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) { + if (gup->which_pages[i] > nr_pages) { + pr_warn("ZEROING due to out of range: .which_pages[%u]: %u\n", + i, gup->which_pages[i]); + gup->which_pages[i] = 0; + } + } + + for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) { + index_to_dump = gup->which_pages[i]; + + if (index_to_dump) { + index_to_dump--; // Decode from 1-based, to 0-based + pr_info("---- page #%u, starting from user virt addr: 0x%llx\n", + index_to_dump, gup->addr); + dump_page(pages[index_to_dump], + "gup_test: dump_pages() test"); + } + } +} + +static int __gup_test_ioctl(unsigned int cmd, + struct gup_test *gup) +{ + ktime_t start_time, end_time; + unsigned long i, nr_pages, addr, next; + long nr; + struct page **pages; + int ret = 0; + bool needs_mmap_lock = + cmd != GUP_FAST_BENCHMARK && cmd != PIN_FAST_BENCHMARK; + + if (gup->size > ULONG_MAX) + return -EINVAL; + + nr_pages = gup->size / PAGE_SIZE; + pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + if (needs_mmap_lock && mmap_read_lock_killable(current->mm)) { + ret = -EINTR; + goto free_pages; + } + + i = 0; + nr = gup->nr_pages_per_call; + start_time = ktime_get(); + for (addr = gup->addr; addr < gup->addr + gup->size; addr = next) { + if (nr != gup->nr_pages_per_call) + break; + + next = addr + nr * PAGE_SIZE; + if (next > gup->addr + gup->size) { + next = gup->addr + gup->size; + nr = (next - addr) / PAGE_SIZE; + } + + switch (cmd) { + case GUP_FAST_BENCHMARK: + nr = get_user_pages_fast(addr, nr, gup->gup_flags, + pages + i); + break; + case GUP_BASIC_TEST: + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, + NULL); + break; + case PIN_FAST_BENCHMARK: + nr = pin_user_pages_fast(addr, nr, gup->gup_flags, + pages + i); + 
break; + case PIN_BASIC_TEST: + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, + NULL); + break; + case PIN_LONGTERM_BENCHMARK: + nr = pin_user_pages(addr, nr, + gup->gup_flags | FOLL_LONGTERM, + pages + i, NULL); + break; + case DUMP_USER_PAGES_TEST: + if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) + nr = pin_user_pages(addr, nr, gup->gup_flags, + pages + i, NULL); + else + nr = get_user_pages(addr, nr, gup->gup_flags, + pages + i, NULL); + break; + default: + ret = -EINVAL; + goto unlock; + } + + if (nr <= 0) + break; + i += nr; + } + end_time = ktime_get(); + + /* Shifting the meaning of nr_pages: now it is actual number pinned: */ + nr_pages = i; + + gup->get_delta_usec = ktime_us_delta(end_time, start_time); + gup->size = addr - gup->addr; + + /* + * Take an un-benchmark-timed moment to verify DMA pinned + * state: print a warning if any non-dma-pinned pages are found: + */ + verify_dma_pinned(cmd, pages, nr_pages); + + if (cmd == DUMP_USER_PAGES_TEST) + dump_pages_test(gup, pages, nr_pages); + + start_time = ktime_get(); + + put_back_pages(cmd, pages, nr_pages, gup->test_flags); + + end_time = ktime_get(); + gup->put_delta_usec = ktime_us_delta(end_time, start_time); + +unlock: + if (needs_mmap_lock) + mmap_read_unlock(current->mm); +free_pages: + kvfree(pages); + return ret; +} + +static long gup_test_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + struct gup_test gup; + int ret; + + switch (cmd) { + case GUP_FAST_BENCHMARK: + case PIN_FAST_BENCHMARK: + case PIN_LONGTERM_BENCHMARK: + case GUP_BASIC_TEST: + case PIN_BASIC_TEST: + case DUMP_USER_PAGES_TEST: + break; + default: + return -EINVAL; + } + + if (copy_from_user(&gup, (void __user *)arg, sizeof(gup))) + return -EFAULT; + + ret = __gup_test_ioctl(cmd, &gup); + if (ret) + return ret; + + if (copy_to_user((void __user *)arg, &gup, sizeof(gup))) + return -EFAULT; + + return 0; +} + +static const struct file_operations gup_test_fops = { + .open = nonseekable_open, + 
.unlocked_ioctl = gup_test_ioctl, +}; + +static int __init gup_test_init(void) +{ + debugfs_create_file_unsafe("gup_test", 0600, NULL, NULL, + &gup_test_fops); + + return 0; +} + +late_initcall(gup_test_init); diff --git a/mm/gup_test.h b/mm/gup_test.h new file mode 100644 index 000000000..887ac1d5f --- /dev/null +++ b/mm/gup_test.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __GUP_TEST_H +#define __GUP_TEST_H + +#include + +#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_test) +#define PIN_FAST_BENCHMARK _IOWR('g', 2, struct gup_test) +#define PIN_LONGTERM_BENCHMARK _IOWR('g', 3, struct gup_test) +#define GUP_BASIC_TEST _IOWR('g', 4, struct gup_test) +#define PIN_BASIC_TEST _IOWR('g', 5, struct gup_test) +#define DUMP_USER_PAGES_TEST _IOWR('g', 6, struct gup_test) + +#define GUP_TEST_MAX_PAGES_TO_DUMP 8 + +#define GUP_TEST_FLAG_DUMP_PAGES_USE_PIN 0x1 + +struct gup_test { + __u64 get_delta_usec; + __u64 put_delta_usec; + __u64 addr; + __u64 size; + __u32 nr_pages_per_call; + __u32 gup_flags; + __u32 test_flags; + /* + * Each non-zero entry is the number of the page (1-based: first page is + * page 1, so that zero entries mean "do nothing") from the .addr base. + */ + __u32 which_pages[GUP_TEST_MAX_PAGES_TO_DUMP]; +}; + +#endif /* __GUP_TEST_H */ diff --git a/mm/highmem.c b/mm/highmem.c new file mode 100644 index 000000000..db251e77f --- /dev/null +++ b/mm/highmem.c @@ -0,0 +1,814 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * High memory handling common code and variables. + * + * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de + * Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * 64-bit physical space. With current x86 CPUs this + * means up to 64 Gigabytes physical RAM. + * + * Rewrote high memory support to move the page cache into + * high memory. Implemented permanent (schedulable) kmaps + * based on Linus' idea. 
+ * + * Copyright (C) 1999 Ingo Molnar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_KMAP_LOCAL +static inline int kmap_local_calc_idx(int idx) +{ + return idx + KM_MAX_IDX * smp_processor_id(); +} + +#ifndef arch_kmap_local_map_idx +#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) +#endif +#endif /* CONFIG_KMAP_LOCAL */ + +/* + * Virtual_count is not a pure "count". + * 0 means that it is not mapped, and has not been mapped + * since a TLB flush - it is usable. + * 1 means that there are no users, but it has been mapped + * since the last TLB flush - so we can't use it. + * n means that there are (n-1) current users of it. + */ +#ifdef CONFIG_HIGHMEM + +/* + * Architecture with aliasing data cache may define the following family of + * helper functions in its asm/highmem.h to control cache color of virtual + * addresses where physical memory pages are mapped by kmap. + */ +#ifndef get_pkmap_color + +/* + * Determine color of virtual address where the page should be mapped. + */ +static inline unsigned int get_pkmap_color(struct page *page) +{ + return 0; +} +#define get_pkmap_color get_pkmap_color + +/* + * Get next index for mapping inside PKMAP region for page with given color. + */ +static inline unsigned int get_next_pkmap_nr(unsigned int color) +{ + static unsigned int last_pkmap_nr; + + last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; + return last_pkmap_nr; +} + +/* + * Determine if page index inside PKMAP region (pkmap_nr) of given color + * has wrapped around PKMAP region end. When this happens an attempt to + * flush all unused PKMAP slots is made. + */ +static inline int no_more_pkmaps(unsigned int pkmap_nr, unsigned int color) +{ + return pkmap_nr == 0; +} + +/* + * Get the number of PKMAP entries of the given color. 
If no free slot is + * found after checking that many entries, kmap will sleep waiting for + * someone to call kunmap and free PKMAP slot. + */ +static inline int get_pkmap_entries_count(unsigned int color) +{ + return LAST_PKMAP; +} + +/* + * Get head of a wait queue for PKMAP entries of the given color. + * Wait queues for different mapping colors should be independent to avoid + * unnecessary wakeups caused by freeing of slots of other colors. + */ +static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) +{ + static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); + + return &pkmap_map_wait; +} +#endif + +atomic_long_t _totalhigh_pages __read_mostly; +EXPORT_SYMBOL(_totalhigh_pages); + +unsigned int __nr_free_highpages(void) +{ + struct zone *zone; + unsigned int pages = 0; + + for_each_populated_zone(zone) { + if (is_highmem(zone)) + pages += zone_page_state(zone, NR_FREE_PAGES); + } + + return pages; +} + +static int pkmap_count[LAST_PKMAP]; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); + +pte_t *pkmap_page_table; + +/* + * Most architectures have no use for kmap_high_get(), so let's abstract + * the disabling of IRQ out of the locking in that case to save on a + * potential useless overhead. 
+ */ +#ifdef ARCH_NEEDS_KMAP_HIGH_GET +#define lock_kmap() spin_lock_irq(&kmap_lock) +#define unlock_kmap() spin_unlock_irq(&kmap_lock) +#define lock_kmap_any(flags) spin_lock_irqsave(&kmap_lock, flags) +#define unlock_kmap_any(flags) spin_unlock_irqrestore(&kmap_lock, flags) +#else +#define lock_kmap() spin_lock(&kmap_lock) +#define unlock_kmap() spin_unlock(&kmap_lock) +#define lock_kmap_any(flags) \ + do { spin_lock(&kmap_lock); (void)(flags); } while (0) +#define unlock_kmap_any(flags) \ + do { spin_unlock(&kmap_lock); (void)(flags); } while (0) +#endif + +struct page *__kmap_to_page(void *vaddr) +{ + unsigned long base = (unsigned long) vaddr & PAGE_MASK; + struct kmap_ctrl *kctrl = &current->kmap_ctrl; + unsigned long addr = (unsigned long)vaddr; + int i; + + /* kmap() mappings */ + if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) && + addr < PKMAP_ADDR(LAST_PKMAP))) + return pte_page(pkmap_page_table[PKMAP_NR(addr)]); + + /* kmap_local_page() mappings */ + if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) && + base < __fix_to_virt(FIX_KMAP_BEGIN))) { + for (i = 0; i < kctrl->idx; i++) { + unsigned long base_addr; + int idx; + + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + + if (base_addr == base) + return pte_page(kctrl->pteval[i]); + } + } + + return virt_to_page(vaddr); +} +EXPORT_SYMBOL(__kmap_to_page); + +static void flush_all_zero_pkmaps(void) +{ + int i; + int need_flush = 0; + + flush_cache_kmaps(); + + for (i = 0; i < LAST_PKMAP; i++) { + struct page *page; + + /* + * zero means we don't have anything to do, + * >1 means that it is still in use.
Only + * a count of 1 means that it is free but + * needs to be unmapped + */ + if (pkmap_count[i] != 1) + continue; + pkmap_count[i] = 0; + + /* sanity check */ + BUG_ON(pte_none(pkmap_page_table[i])); + + /* + * Don't need an atomic fetch-and-clear op here; + * no-one has the page mapped, and cannot get at + * its virtual address (and hence PTE) without first + * getting the kmap_lock (which is held here). + * So no dangers, even with speculative execution. + */ + page = pte_page(pkmap_page_table[i]); + pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]); + + set_page_address(page, NULL); + need_flush = 1; + } + if (need_flush) + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); +} + +void __kmap_flush_unused(void) +{ + lock_kmap(); + flush_all_zero_pkmaps(); + unlock_kmap(); +} + +static inline unsigned long map_new_virtual(struct page *page) +{ + unsigned long vaddr; + int count; + unsigned int last_pkmap_nr; + unsigned int color = get_pkmap_color(page); + +start: + count = get_pkmap_entries_count(color); + /* Find an empty entry */ + for (;;) { + last_pkmap_nr = get_next_pkmap_nr(color); + if (no_more_pkmaps(last_pkmap_nr, color)) { + flush_all_zero_pkmaps(); + count = get_pkmap_entries_count(color); + } + if (!pkmap_count[last_pkmap_nr]) + break; /* Found a usable entry */ + if (--count) + continue; + + /* + * Sleep for somebody else to unmap their entries + */ + { + DECLARE_WAITQUEUE(wait, current); + wait_queue_head_t *pkmap_map_wait = + get_pkmap_wait_queue_head(color); + + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(pkmap_map_wait, &wait); + unlock_kmap(); + schedule(); + remove_wait_queue(pkmap_map_wait, &wait); + lock_kmap(); + + /* Somebody else might have mapped it while we slept */ + if (page_address(page)) + return (unsigned long)page_address(page); + + /* Re-start */ + goto start; + } + } + vaddr = PKMAP_ADDR(last_pkmap_nr); + set_pte_at(&init_mm, vaddr, + &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, 
kmap_prot)); + + pkmap_count[last_pkmap_nr] = 1; + set_page_address(page, (void *)vaddr); + + return vaddr; +} + +/** + * kmap_high - map a highmem page into memory + * @page: &struct page to map + * + * Returns the page's virtual memory address. + * + * We cannot call this from interrupts, as it may block. + */ +void *kmap_high(struct page *page) +{ + unsigned long vaddr; + + /* + * For highmem pages, we can't trust "virtual" until + * after we have the lock. + */ + lock_kmap(); + vaddr = (unsigned long)page_address(page); + if (!vaddr) + vaddr = map_new_virtual(page); + pkmap_count[PKMAP_NR(vaddr)]++; + BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); + unlock_kmap(); + return (void *) vaddr; +} +EXPORT_SYMBOL(kmap_high); + +#ifdef ARCH_NEEDS_KMAP_HIGH_GET +/** + * kmap_high_get - pin a highmem page into memory + * @page: &struct page to pin + * + * Returns the page's current virtual memory address, or NULL if no mapping + * exists. If and only if a non null address is returned then a + * matching call to kunmap_high() is necessary. + * + * This can be called from any context. + */ +void *kmap_high_get(struct page *page) +{ + unsigned long vaddr, flags; + + lock_kmap_any(flags); + vaddr = (unsigned long)page_address(page); + if (vaddr) { + BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 1); + pkmap_count[PKMAP_NR(vaddr)]++; + } + unlock_kmap_any(flags); + return (void *) vaddr; +} +#endif + +/** + * kunmap_high - unmap a highmem page into memory + * @page: &struct page to unmap + * + * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called + * only from user context. + */ +void kunmap_high(struct page *page) +{ + unsigned long vaddr; + unsigned long nr; + unsigned long flags; + int need_wakeup; + unsigned int color = get_pkmap_color(page); + wait_queue_head_t *pkmap_map_wait; + + lock_kmap_any(flags); + vaddr = (unsigned long)page_address(page); + BUG_ON(!vaddr); + nr = PKMAP_NR(vaddr); + + /* + * A count must never go down to zero + * without a TLB flush! 
+ */ + need_wakeup = 0; + switch (--pkmap_count[nr]) { + case 0: + BUG(); + case 1: + /* + * Avoid an unnecessary wake_up() function call. + * The common case is pkmap_count[] == 1, but + * no waiters. + * The tasks queued in the wait-queue are guarded + * by both the lock in the wait-queue-head and by + * the kmap_lock. As the kmap_lock is held here, + * no need for the wait-queue-head's lock. Simply + * test if the queue is empty. + */ + pkmap_map_wait = get_pkmap_wait_queue_head(color); + need_wakeup = waitqueue_active(pkmap_map_wait); + } + unlock_kmap_any(flags); + + /* do wake-up, if needed, race-free outside of the spin lock */ + if (need_wakeup) + wake_up(pkmap_map_wait); +} +EXPORT_SYMBOL(kunmap_high); + +void zero_user_segments(struct page *page, unsigned start1, unsigned end1, + unsigned start2, unsigned end2) +{ + unsigned int i; + + BUG_ON(end1 > page_size(page) || end2 > page_size(page)); + + if (start1 >= end1) + start1 = end1 = 0; + if (start2 >= end2) + start2 = end2 = 0; + + for (i = 0; i < compound_nr(page); i++) { + void *kaddr = NULL; + + if (start1 >= PAGE_SIZE) { + start1 -= PAGE_SIZE; + end1 -= PAGE_SIZE; + } else { + unsigned this_end = min_t(unsigned, end1, PAGE_SIZE); + + if (end1 > start1) { + kaddr = kmap_local_page(page + i); + memset(kaddr + start1, 0, this_end - start1); + } + end1 -= this_end; + start1 = 0; + } + + if (start2 >= PAGE_SIZE) { + start2 -= PAGE_SIZE; + end2 -= PAGE_SIZE; + } else { + unsigned this_end = min_t(unsigned, end2, PAGE_SIZE); + + if (end2 > start2) { + if (!kaddr) + kaddr = kmap_local_page(page + i); + memset(kaddr + start2, 0, this_end - start2); + } + end2 -= this_end; + start2 = 0; + } + + if (kaddr) { + kunmap_local(kaddr); + flush_dcache_page(page + i); + } + + if (!end1 && !end2) + break; + } + + BUG_ON((start1 | start2 | end1 | end2) != 0); +} +EXPORT_SYMBOL(zero_user_segments); +#endif /* CONFIG_HIGHMEM */ + +#ifdef CONFIG_KMAP_LOCAL + +#include + +/* + * With DEBUG_KMAP_LOCAL the stack depth is 
doubled and every second + * slot is unused which acts as a guard page + */ +#ifdef CONFIG_DEBUG_KMAP_LOCAL +# define KM_INCR 2 +#else +# define KM_INCR 1 +#endif + +static inline int kmap_local_idx_push(void) +{ + WARN_ON_ONCE(in_hardirq() && !irqs_disabled()); + current->kmap_ctrl.idx += KM_INCR; + BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX); + return current->kmap_ctrl.idx - 1; +} + +static inline int kmap_local_idx(void) +{ + return current->kmap_ctrl.idx - 1; +} + +static inline void kmap_local_idx_pop(void) +{ + current->kmap_ctrl.idx -= KM_INCR; + BUG_ON(current->kmap_ctrl.idx < 0); +} + +#ifndef arch_kmap_local_post_map +# define arch_kmap_local_post_map(vaddr, pteval) do { } while (0) +#endif + +#ifndef arch_kmap_local_pre_unmap +# define arch_kmap_local_pre_unmap(vaddr) do { } while (0) +#endif + +#ifndef arch_kmap_local_post_unmap +# define arch_kmap_local_post_unmap(vaddr) do { } while (0) +#endif + +#ifndef arch_kmap_local_unmap_idx +#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx) +#endif + +#ifndef arch_kmap_local_high_get +static inline void *arch_kmap_local_high_get(struct page *page) +{ + return NULL; +} +#endif + +#ifndef arch_kmap_local_set_pte +#define arch_kmap_local_set_pte(mm, vaddr, ptep, ptev) \ + set_pte_at(mm, vaddr, ptep, ptev) +#endif + +/* Unmap a local mapping which was obtained by kmap_high_get() */ +static inline bool kmap_high_unmap_local(unsigned long vaddr) +{ +#ifdef ARCH_NEEDS_KMAP_HIGH_GET + if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { + kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); + return true; + } +#endif + return false; +} + +static pte_t *__kmap_pte; + +static pte_t *kmap_get_pte(unsigned long vaddr, int idx) +{ + if (IS_ENABLED(CONFIG_KMAP_LOCAL_NON_LINEAR_PTE_ARRAY)) + /* + * Set by the arch if __kmap_pte[-idx] does not produce + * the correct entry. 
+ */ + return virt_to_kpte(vaddr); + if (!__kmap_pte) + __kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); + return &__kmap_pte[-idx]; +} + +void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) +{ + pte_t pteval, *kmap_pte; + unsigned long vaddr; + int idx; + + /* + * Disable migration so resulting virtual address is stable + * across preemption. + */ + migrate_disable(); + preempt_disable(); + idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + kmap_pte = kmap_get_pte(vaddr, idx); + BUG_ON(!pte_none(*kmap_pte)); + pteval = pfn_pte(pfn, prot); + arch_kmap_local_set_pte(&init_mm, vaddr, kmap_pte, pteval); + arch_kmap_local_post_map(vaddr, pteval); + current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; + preempt_enable(); + + return (void *)vaddr; +} +EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot); + +void *__kmap_local_page_prot(struct page *page, pgprot_t prot) +{ + void *kmap; + + /* + * To broaden the usage of the actual kmap_local() machinery always map + * pages when debugging is enabled and the architecture has no problems + * with alias mappings. + */ + if (!IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) && !PageHighMem(page)) + return page_address(page); + + /* Try kmap_high_get() if architecture has it enabled */ + kmap = arch_kmap_local_high_get(page); + if (kmap) + return kmap; + + return __kmap_local_pfn_prot(page_to_pfn(page), prot); +} +EXPORT_SYMBOL(__kmap_local_page_prot); + +void kunmap_local_indexed(const void *vaddr) +{ + unsigned long addr = (unsigned long) vaddr & PAGE_MASK; + pte_t *kmap_pte; + int idx; + + if (addr < __fix_to_virt(FIX_KMAP_END) || + addr > __fix_to_virt(FIX_KMAP_BEGIN)) { + if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP)) { + /* This _should_ never happen! See above. */ + WARN_ON_ONCE(1); + return; + } + /* + * Handle mappings which were obtained by kmap_high_get() + * first as the virtual address of such mappings is below + * PAGE_OFFSET. 
Warn for all other addresses which are in + * the user space part of the virtual address space. + */ + if (!kmap_high_unmap_local(addr)) + WARN_ON_ONCE(addr < PAGE_OFFSET); + return; + } + + preempt_disable(); + idx = arch_kmap_local_unmap_idx(kmap_local_idx(), addr); + WARN_ON_ONCE(addr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + + kmap_pte = kmap_get_pte(addr, idx); + arch_kmap_local_pre_unmap(addr); + pte_clear(&init_mm, addr, kmap_pte); + arch_kmap_local_post_unmap(addr); + current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); + kmap_local_idx_pop(); + preempt_enable(); + migrate_enable(); +} +EXPORT_SYMBOL(kunmap_local_indexed); + +/* + * Invoked before switch_to(). This is safe even when during or after + * clearing the maps an interrupt which needs a kmap_local happens because + * the task::kmap_ctrl.idx is not modified by the unmapping code so a + * nested kmap_local will use the next unused index and restore the index + * on unmap. The already cleared kmaps of the outgoing task are irrelevant + * because the interrupt context does not know about them. The same applies + * when scheduling back in for an interrupt which happens before the + * restore is complete. + */ +void __kmap_local_sched_out(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte; + int i; + + /* Clear kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) { + WARN_ON_ONCE(pte_val(pteval) != 0); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* + * This is a horrible hack for XTENSA to calculate the + * coloured PTE index. Uses the PFN encoded into the pteval + * and the map index calculation because the actual mapped + * virtual address is not stored in task::kmap_ctrl. + * For any sane architecture this is optimized out. 
+ */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + kmap_pte = kmap_get_pte(addr, idx); + arch_kmap_local_pre_unmap(addr); + pte_clear(&init_mm, addr, kmap_pte); + arch_kmap_local_post_unmap(addr); + } +} + +void __kmap_local_sched_in(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte; + int i; + + /* Restore kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) { + WARN_ON_ONCE(pte_val(pteval) != 0); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* See comment in __kmap_local_sched_out() */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + kmap_pte = kmap_get_pte(addr, idx); + set_pte_at(&init_mm, addr, kmap_pte, pteval); + arch_kmap_local_post_map(addr, pteval); + } +} + +void kmap_local_fork(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->kmap_ctrl.idx)) + memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl)); +} + +#endif + +#if defined(HASHED_PAGE_VIRTUAL) + +#define PA_HASH_ORDER 7 + +/* + * Describes one page->virtual association + */ +struct page_address_map { + struct page *page; + void *virtual; + struct list_head list; +}; + +static struct page_address_map page_address_maps[LAST_PKMAP]; + +/* + * Hash table bucket + */ +static struct page_address_slot { + struct list_head lh; /* List of page_address_maps */ + spinlock_t lock; /* Protect this bucket's list */ +} ____cacheline_aligned_in_smp page_address_htable[1<lock, flags); + if (!list_empty(&pas->lh)) { + struct page_address_map *pam; + + list_for_each_entry(pam, &pas->lh, list) { + if (pam->page == page) { + ret = pam->virtual; + break; + } + } + } + + spin_unlock_irqrestore(&pas->lock, flags); + return ret; +} +EXPORT_SYMBOL(page_address); + +/** + * 
set_page_address - set a page's virtual address + * @page: &struct page to set + * @virtual: virtual address to use + */ +void set_page_address(struct page *page, void *virtual) +{ + unsigned long flags; + struct page_address_slot *pas; + struct page_address_map *pam; + + BUG_ON(!PageHighMem(page)); + + pas = page_slot(page); + if (virtual) { /* Add */ + pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)]; + pam->page = page; + pam->virtual = virtual; + + spin_lock_irqsave(&pas->lock, flags); + list_add_tail(&pam->list, &pas->lh); + spin_unlock_irqrestore(&pas->lock, flags); + } else { /* Remove */ + spin_lock_irqsave(&pas->lock, flags); + list_for_each_entry(pam, &pas->lh, list) { + if (pam->page == page) { + list_del(&pam->list); + break; + } + } + spin_unlock_irqrestore(&pas->lock, flags); + } + + return; +} + +void __init page_address_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { + INIT_LIST_HEAD(&page_address_htable[i].lh); + spin_lock_init(&page_address_htable[i].lock); + } +} + +#endif /* defined(HASHED_PAGE_VIRTUAL) */ diff --git a/mm/hmm.c b/mm/hmm.c new file mode 100644 index 000000000..3850fb625 --- /dev/null +++ b/mm/hmm.c @@ -0,0 +1,599 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2013 Red Hat Inc. + * + * Authors: Jérôme Glisse + */ +/* + * Refer to include/linux/hmm.h for information about heterogeneous memory + * management or HMM for short. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +struct hmm_vma_walk { + struct hmm_range *range; + unsigned long last; +}; + +enum { + HMM_NEED_FAULT = 1 << 0, + HMM_NEED_WRITE_FAULT = 1 << 1, + HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, +}; + +static int hmm_pfns_fill(unsigned long addr, unsigned long end, + struct hmm_range *range, unsigned long cpu_flags) +{ + unsigned long i = (addr - range->start) >> PAGE_SHIFT; + + for (; addr < end; addr += PAGE_SIZE, i++) + range->hmm_pfns[i] = cpu_flags; + return 0; +} + +/* + * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s) + * @addr: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @required_fault: HMM_NEED_* flags + * @walk: mm_walk structure + * Return: -EBUSY after page fault, or page fault error + * + * This function will be called whenever pmd_none() or pte_none() returns true, + * or whenever there is no page directory covering the virtual address range. 
+ */ +static int hmm_vma_fault(unsigned long addr, unsigned long end, + unsigned int required_fault, struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned int fault_flags = FAULT_FLAG_REMOTE; + + WARN_ON_ONCE(!required_fault); + hmm_vma_walk->last = addr; + + if (required_fault & HMM_NEED_WRITE_FAULT) { + if (!(vma->vm_flags & VM_WRITE)) + return -EPERM; + fault_flags |= FAULT_FLAG_WRITE; + } + + for (; addr < end; addr += PAGE_SIZE) + if (handle_mm_fault(vma, addr, fault_flags, NULL) & + VM_FAULT_ERROR) + return -EFAULT; + return -EBUSY; +} + +static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + unsigned long pfn_req_flags, + unsigned long cpu_flags) +{ + struct hmm_range *range = hmm_vma_walk->range; + + /* + * So we not only consider the individual per page request we also + * consider the default flags requested for the range. The API can + * be used 2 ways. The first one where the HMM user coalesces + * multiple page faults into one request and sets flags per pfn for + * those faults. The second one where the HMM user wants to pre- + * fault a range with specific flags. For the latter one it is a + * waste to have the user pre-fill the pfn arrays with a default + * flags value. + */ + pfn_req_flags &= range->pfn_flags_mask; + pfn_req_flags |= range->default_flags; + + /* We aren't asked to do anything ... */ + if (!(pfn_req_flags & HMM_PFN_REQ_FAULT)) + return 0; + + /* Need to write fault ?
*/ + if ((pfn_req_flags & HMM_PFN_REQ_WRITE) && + !(cpu_flags & HMM_PFN_WRITE)) + return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT; + + /* If CPU page table is not valid then we need to fault */ + if (!(cpu_flags & HMM_PFN_VALID)) + return HMM_NEED_FAULT; + return 0; +} + +static unsigned int +hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + const unsigned long hmm_pfns[], unsigned long npages, + unsigned long cpu_flags) +{ + struct hmm_range *range = hmm_vma_walk->range; + unsigned int required_fault = 0; + unsigned long i; + + /* + * If the default flags do not request to fault pages, and the mask does + * not allow for individual pages to be faulted, then + * hmm_pte_need_fault() will always return 0. + */ + if (!((range->default_flags | range->pfn_flags_mask) & + HMM_PFN_REQ_FAULT)) + return 0; + + for (i = 0; i < npages; ++i) { + required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i], + cpu_flags); + if (required_fault == HMM_NEED_ALL_BITS) + return required_fault; + } + return required_fault; +} + +static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, + __always_unused int depth, struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned int required_fault; + unsigned long i, npages; + unsigned long *hmm_pfns; + + i = (addr - range->start) >> PAGE_SHIFT; + npages = (end - addr) >> PAGE_SHIFT; + hmm_pfns = &range->hmm_pfns[i]; + required_fault = + hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0); + if (!walk->vma) { + if (required_fault) + return -EFAULT; + return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR); + } + if (required_fault) + return hmm_vma_fault(addr, end, required_fault, walk); + return hmm_pfns_fill(addr, end, range, 0); +} + +static inline unsigned long hmm_pfn_flags_order(unsigned long order) +{ + return order << HMM_PFN_ORDER_SHIFT; +} + +static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range, + pmd_t pmd) 
+{ + if (pmd_protnone(pmd)) + return 0; + return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : + HMM_PFN_VALID) | + hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, + unsigned long end, unsigned long hmm_pfns[], + pmd_t pmd) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long pfn, npages, i; + unsigned int required_fault; + unsigned long cpu_flags; + + npages = (end - addr) >> PAGE_SHIFT; + cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); + required_fault = + hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags); + if (required_fault) + return hmm_vma_fault(addr, end, required_fault, walk); + + pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) + hmm_pfns[i] = pfn | cpu_flags; + return 0; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +/* stub to allow the code below to compile */ +int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, + unsigned long end, unsigned long hmm_pfns[], pmd_t pmd); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range, + pte_t pte) +{ + if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte)) + return 0; + return pte_write(pte) ? 
(HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; +} + +static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, + unsigned long end, pmd_t *pmdp, pte_t *ptep, + unsigned long *hmm_pfn) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned int required_fault; + unsigned long cpu_flags; + pte_t pte = *ptep; + uint64_t pfn_req_flags = *hmm_pfn; + + if (pte_none_mostly(pte)) { + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); + if (required_fault) + goto fault; + *hmm_pfn = 0; + return 0; + } + + if (!pte_present(pte)) { + swp_entry_t entry = pte_to_swp_entry(pte); + + /* + * Don't fault in device private pages owned by the caller, + * just report the PFN. + */ + if (is_device_private_entry(entry) && + pfn_swap_entry_to_page(entry)->pgmap->owner == + range->dev_private_owner) { + cpu_flags = HMM_PFN_VALID; + if (is_writable_device_private_entry(entry)) + cpu_flags |= HMM_PFN_WRITE; + *hmm_pfn = swp_offset_pfn(entry) | cpu_flags; + return 0; + } + + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); + if (!required_fault) { + *hmm_pfn = 0; + return 0; + } + + if (!non_swap_entry(entry)) + goto fault; + + if (is_device_private_entry(entry)) + goto fault; + + if (is_device_exclusive_entry(entry)) + goto fault; + + if (is_migration_entry(entry)) { + pte_unmap(ptep); + hmm_vma_walk->last = addr; + migration_entry_wait(walk->mm, pmdp, addr); + return -EBUSY; + } + + /* Report error for everything else */ + pte_unmap(ptep); + return -EFAULT; + } + + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); + if (required_fault) + goto fault; + + /* + * Bypass devmap pte such as DAX page when all pfn requested + * flags(pfn_req_flags) are fulfilled. + * Since each architecture defines a struct page for the zero page, just + * fall through and treat it like a normal page. 
+ */ + if (!vm_normal_page(walk->vma, addr, pte) && + !pte_devmap(pte) && + !is_zero_pfn(pte_pfn(pte))) { + if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { + pte_unmap(ptep); + return -EFAULT; + } + *hmm_pfn = HMM_PFN_ERROR; + return 0; + } + + *hmm_pfn = pte_pfn(pte) | cpu_flags; + return 0; + +fault: + pte_unmap(ptep); + /* Fault any virtual address we were asked to fault */ + return hmm_vma_fault(addr, end, required_fault, walk); +} + +static int hmm_vma_walk_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long *hmm_pfns = + &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT]; + unsigned long npages = (end - start) >> PAGE_SHIFT; + unsigned long addr = start; + pte_t *ptep; + pmd_t pmd; + +again: + pmd = READ_ONCE(*pmdp); + if (pmd_none(pmd)) + return hmm_vma_walk_hole(start, end, -1, walk); + + if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) { + hmm_vma_walk->last = addr; + pmd_migration_entry_wait(walk->mm, pmdp); + return -EBUSY; + } + return hmm_pfns_fill(start, end, range, 0); + } + + if (!pmd_present(pmd)) { + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) + return -EFAULT; + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); + } + + if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { + /* + * No need to take pmd_lock here, even if some other thread + * is splitting the huge pmd we will get that event through + * mmu_notifier callback. + * + * So just read pmd value and check again it's a transparent + * huge or device mapping one and compute corresponding pfn + * values. 
+ */ + pmd = pmd_read_atomic(pmdp); + barrier(); + if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) + goto again; + + return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd); + } + + /* + * We have handled all the valid cases above ie either none, migration, + * huge or transparent huge. At this point either it is a valid pmd + * entry pointing to pte directory or it is a bad pmd that will not + * recover. + */ + if (pmd_bad(pmd)) { + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) + return -EFAULT; + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); + } + + ptep = pte_offset_map(pmdp, addr); + for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) { + int r; + + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns); + if (r) { + /* hmm_vma_handle_pte() did pte_unmap() */ + return r; + } + } + pte_unmap(ptep - 1); + return 0; +} + +#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range, + pud_t pud) +{ + if (!pud_present(pud)) + return 0; + return (pud_write(pud) ? 
(HMM_PFN_VALID | HMM_PFN_WRITE) : + HMM_PFN_VALID) | + hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT); +} + +static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long addr = start; + pud_t pud; + spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma); + + if (!ptl) + return 0; + + /* Normally we don't want to split the huge page */ + walk->action = ACTION_CONTINUE; + + pud = READ_ONCE(*pudp); + if (pud_none(pud)) { + spin_unlock(ptl); + return hmm_vma_walk_hole(start, end, -1, walk); + } + + if (pud_huge(pud) && pud_devmap(pud)) { + unsigned long i, npages, pfn; + unsigned int required_fault; + unsigned long *hmm_pfns; + unsigned long cpu_flags; + + if (!pud_present(pud)) { + spin_unlock(ptl); + return hmm_vma_walk_hole(start, end, -1, walk); + } + + i = (addr - range->start) >> PAGE_SHIFT; + npages = (end - addr) >> PAGE_SHIFT; + hmm_pfns = &range->hmm_pfns[i]; + + cpu_flags = pud_to_hmm_pfn_flags(range, pud); + required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns, + npages, cpu_flags); + if (required_fault) { + spin_unlock(ptl); + return hmm_vma_fault(addr, end, required_fault, walk); + } + + pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + for (i = 0; i < npages; ++i, ++pfn) + hmm_pfns[i] = pfn | cpu_flags; + goto out_unlock; + } + + /* Ask for the PUD to be split */ + walk->action = ACTION_SUBTREE; + +out_unlock: + spin_unlock(ptl); + return 0; +} +#else +#define hmm_vma_walk_pud NULL +#endif + +#ifdef CONFIG_HUGETLB_PAGE +static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + unsigned long addr = start, i, pfn; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + unsigned int required_fault; + unsigned 
long pfn_req_flags; + unsigned long cpu_flags; + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); + entry = huge_ptep_get(pte); + + i = (start - range->start) >> PAGE_SHIFT; + pfn_req_flags = range->hmm_pfns[i]; + cpu_flags = pte_to_hmm_pfn_flags(range, entry) | + hmm_pfn_flags_order(huge_page_order(hstate_vma(vma))); + required_fault = + hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); + if (required_fault) { + spin_unlock(ptl); + return hmm_vma_fault(addr, end, required_fault, walk); + } + + pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) + range->hmm_pfns[i] = pfn | cpu_flags; + + spin_unlock(ptl); + return 0; +} +#else +#define hmm_vma_walk_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + +static int hmm_vma_walk_test(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) && + vma->vm_flags & VM_READ) + return 0; + + /* + * vma ranges that don't have struct page backing them or map I/O + * devices directly cannot be handled by hmm_range_fault(). + * + * If the vma does not allow read access, then assume that it does not + * allow write access either. HMM does not support architectures that + * allow write without read. + * + * If a fault is requested for an unsupported range then it is a hard + * failure. + */ + if (hmm_range_need_fault(hmm_vma_walk, + range->hmm_pfns + + ((start - range->start) >> PAGE_SHIFT), + (end - start) >> PAGE_SHIFT, 0)) + return -EFAULT; + + hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); + + /* Skip this vma and continue processing the next vma. 
*/ + return 1; +} + +static const struct mm_walk_ops hmm_walk_ops = { + .pud_entry = hmm_vma_walk_pud, + .pmd_entry = hmm_vma_walk_pmd, + .pte_hole = hmm_vma_walk_hole, + .hugetlb_entry = hmm_vma_walk_hugetlb_entry, + .test_walk = hmm_vma_walk_test, +}; + +/** + * hmm_range_fault - try to fault some address in a virtual address range + * @range: argument structure + * + * Returns 0 on success or one of the following error codes: + * + * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma + * (e.g., device file vma). + * -ENOMEM: Out of memory. + * -EPERM: Invalid permission (e.g., asking for write and range is read + * only). + * -EBUSY: The range has been invalidated and the caller needs to wait for + * the invalidation to finish. + * -EFAULT: A page was requested to be valid and could not be made valid + * ie it has no backing VMA or it is illegal to access + * + * This is similar to get_user_pages(), except that it can read the page tables + * without mutating them (ie causing faults). + */ +int hmm_range_fault(struct hmm_range *range) +{ + struct hmm_vma_walk hmm_vma_walk = { + .range = range, + .last = range->start, + }; + struct mm_struct *mm = range->notifier->mm; + int ret; + + mmap_assert_locked(mm); + + do { + /* If range is no longer valid force retry. */ + if (mmu_interval_check_retry(range->notifier, + range->notifier_seq)) + return -EBUSY; + ret = walk_page_range(mm, hmm_vma_walk.last, range->end, + &hmm_walk_ops, &hmm_vma_walk); + /* + * When -EBUSY is returned the loop restarts with + * hmm_vma_walk.last set to an address that has not been stored + * in pfns. All entries < last in the pfn array are set to their + * output, and all >= are still at their input values. 
+ */ + } while (ret == -EBUSY); + return ret; +} +EXPORT_SYMBOL(hmm_range_fault); diff --git a/mm/huge_memory.c b/mm/huge_memory.c new file mode 100644 index 000000000..595779467 --- /dev/null +++ b/mm/huge_memory.c @@ -0,0 +1,3302 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2009 Red Hat, Inc. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "internal.h" +#include "swap.h" + +#define CREATE_TRACE_POINTS +#include + +/* + * By default, transparent hugepage support is disabled in order to avoid + * risking an increased memory footprint for applications that are not + * guaranteed to benefit from it. When transparent hugepage support is + * enabled, it is for all mappings, and khugepaged scans all mappings. + * Defrag is invoked by khugepaged hugepage allocations and by page faults + * for all hugepage allocations. + */ +unsigned long transparent_hugepage_flags __read_mostly = +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS + (1<vm_mm) /* vdso */ + return false; + + /* + * Explicitly disabled through madvise or prctl, or some + * architectures may disable THP for some mappings, for + * example, s390 kvm. + * */ + if ((vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return false; + /* + * If the hardware/firmware marked hugepage support disabled. + */ + if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) + return false; + + /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ + if (vma_is_dax(vma)) + return in_pf; + + /* + * Special VMA and hugetlb VMA. + * Must be checked after dax since some dax mappings may have + * VM_MIXEDMAP set. 
+ */ + if (vm_flags & VM_NO_KHUGEPAGED) + return false; + + /* + * Check alignment for file vma and size for both file and anon vma. + * + * Skip the check for page fault. Huge fault does the check in fault + * handlers. And this check is not suitable for huge PUD fault. + */ + if (!in_pf && + !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) + return false; + + /* + * Enabled via shmem mount options or sysfs settings. + * Must be done before hugepage flags check since shmem has its + * own flags. + */ + if (!in_pf && shmem_file(vma->vm_file)) + return shmem_huge_enabled(vma, !enforce_sysfs); + + /* Enforce sysfs THP requirements as necessary */ + if (enforce_sysfs && + (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) && + !hugepage_flags_always()))) + return false; + + /* Only regular file is valid */ + if (!in_pf && file_thp_enabled(vma)) + return true; + + if (!vma_is_anonymous(vma)) + return false; + + if (vma_is_temporary_stack(vma)) + return false; + + /* + * THPeligible bit of smaps should show 1 for proper VMAs even + * though anon_vma is not initialized yet. + * + * Allow page fault since anon_vma may be not initialized until + * the first page fault. + */ + if (!vma->anon_vma) + return (smaps || in_pf); + + return true; +} + +static bool get_huge_zero_page(void) +{ + struct page *zero_page; +retry: + if (likely(atomic_inc_not_zero(&huge_zero_refcount))) + return true; + + zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, + HPAGE_PMD_ORDER); + if (!zero_page) { + count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); + return false; + } + preempt_disable(); + if (cmpxchg(&huge_zero_page, NULL, zero_page)) { + preempt_enable(); + __free_pages(zero_page, compound_order(zero_page)); + goto retry; + } + WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page)); + + /* We take additional reference here. 
It will be put back by shrinker */ + atomic_set(&huge_zero_refcount, 2); + preempt_enable(); + count_vm_event(THP_ZERO_PAGE_ALLOC); + return true; +} + +static void put_huge_zero_page(void) +{ + /* + * Counter should never go to zero here. Only shrinker can put + * last reference. + */ + BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); +} + +struct page *mm_get_huge_zero_page(struct mm_struct *mm) +{ + if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + return READ_ONCE(huge_zero_page); + + if (!get_huge_zero_page()) + return NULL; + + if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + put_huge_zero_page(); + + return READ_ONCE(huge_zero_page); +} + +void mm_put_huge_zero_page(struct mm_struct *mm) +{ + if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + put_huge_zero_page(); +} + +static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + /* we can free zero page only if last reference remains */ + return atomic_read(&huge_zero_refcount) == 1 ? 
HPAGE_PMD_NR : 0; +} + +static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { + struct page *zero_page = xchg(&huge_zero_page, NULL); + BUG_ON(zero_page == NULL); + WRITE_ONCE(huge_zero_pfn, ~0UL); + __free_pages(zero_page, compound_order(zero_page)); + return HPAGE_PMD_NR; + } + + return 0; +} + +static struct shrinker huge_zero_page_shrinker = { + .count_objects = shrink_huge_zero_page_count, + .scan_objects = shrink_huge_zero_page_scan, + .seeks = DEFAULT_SEEKS, +}; + +#ifdef CONFIG_SYSFS +static ssize_t enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + const char *output; + + if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags)) + output = "[always] madvise never"; + else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags)) + output = "always [madvise] never"; + else + output = "always madvise [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret = count; + + if (sysfs_streq(buf, "always")) { + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + } else if (sysfs_streq(buf, "madvise")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else if (sysfs_streq(buf, "never")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else + ret = -EINVAL; + + if (ret > 0) { + int err = start_stop_khugepaged(); + if (err) + ret = err; + } + return ret; +} + +static struct kobj_attribute enabled_attr = __ATTR_RW(enabled); + +ssize_t single_hugepage_flag_show(struct kobject 
*kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag flag) +{ + return sysfs_emit(buf, "%d\n", + !!test_bit(flag, &transparent_hugepage_flags)); +} + +ssize_t single_hugepage_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag flag) +{ + unsigned long value; + int ret; + + ret = kstrtoul(buf, 10, &value); + if (ret < 0) + return ret; + if (value > 1) + return -EINVAL; + + if (value) + set_bit(flag, &transparent_hugepage_flags); + else + clear_bit(flag, &transparent_hugepage_flags); + + return count; +} + +static ssize_t defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + const char *output; + + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + &transparent_hugepage_flags)) + output = "[always] defer defer+madvise madvise never"; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, + &transparent_hugepage_flags)) + output = "always [defer] defer+madvise madvise never"; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, + &transparent_hugepage_flags)) + output = "always defer [defer+madvise] madvise never"; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + &transparent_hugepage_flags)) + output = "always defer defer+madvise [madvise] never"; + else + output = "always defer defer+madvise madvise [never]"; + + return sysfs_emit(buf, "%s\n", output); +} + +static ssize_t defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (sysfs_streq(buf, "always")) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + } else if (sysfs_streq(buf, "defer+madvise")) { + 
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + } else if (sysfs_streq(buf, "defer")) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + } else if (sysfs_streq(buf, "madvise")) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else if (sysfs_streq(buf, "never")) { + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; +} +static struct kobj_attribute defrag_attr = __ATTR_RW(defrag); + +static ssize_t use_zero_page_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_hugepage_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); +} +static ssize_t use_zero_page_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + return single_hugepage_flag_store(kobj, attr, buf, count, + 
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); +} +static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page); + +static ssize_t hpage_pmd_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE); +} +static struct kobj_attribute hpage_pmd_size_attr = + __ATTR_RO(hpage_pmd_size); + +static struct attribute *hugepage_attr[] = { + &enabled_attr.attr, + &defrag_attr.attr, + &use_zero_page_attr.attr, + &hpage_pmd_size_attr.attr, +#ifdef CONFIG_SHMEM + &shmem_enabled_attr.attr, +#endif + NULL, +}; + +static const struct attribute_group hugepage_attr_group = { + .attrs = hugepage_attr, +}; + +static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) +{ + int err; + + *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!*hugepage_kobj)) { + pr_err("failed to create transparent hugepage kobject\n"); + return -ENOMEM; + } + + err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); + if (err) { + pr_err("failed to register transparent hugepage group\n"); + goto delete_obj; + } + + err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); + if (err) { + pr_err("failed to register transparent hugepage group\n"); + goto remove_hp_group; + } + + return 0; + +remove_hp_group: + sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group); +delete_obj: + kobject_put(*hugepage_kobj); + return err; +} + +static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ + sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group); + sysfs_remove_group(hugepage_kobj, &hugepage_attr_group); + kobject_put(hugepage_kobj); +} +#else +static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj) +{ + return 0; +} + +static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) +{ +} +#endif /* CONFIG_SYSFS */ + +static int __init hugepage_init(void) +{ + int err; + struct kobject *hugepage_kobj; + + if 
(!has_transparent_hugepage()) { + /* + * Hardware doesn't support hugepages, hence disable + * DAX PMD support. + */ + transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX; + return -EINVAL; + } + + /* + * hugepages can't be allocated by the buddy allocator + */ + MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER); + /* + * we use page->mapping and page->index in second tail page + * as list_head: assuming THP order >= 2 + */ + MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); + + err = hugepage_init_sysfs(&hugepage_kobj); + if (err) + goto err_sysfs; + + err = khugepaged_init(); + if (err) + goto err_slab; + + err = register_shrinker(&huge_zero_page_shrinker, "thp-zero"); + if (err) + goto err_hzp_shrinker; + err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split"); + if (err) + goto err_split_shrinker; + + /* + * By default disable transparent hugepages on smaller systems, + * where the extra memory used could hurt more than TLB overhead + * is likely to save. The admin can still enable it through /sys. 
+ */ + if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { + transparent_hugepage_flags = 0; + return 0; + } + + err = start_stop_khugepaged(); + if (err) + goto err_khugepaged; + + return 0; +err_khugepaged: + unregister_shrinker(&deferred_split_shrinker); +err_split_shrinker: + unregister_shrinker(&huge_zero_page_shrinker); +err_hzp_shrinker: + khugepaged_destroy(); +err_slab: + hugepage_exit_sysfs(hugepage_kobj); +err_sysfs: + return err; +} +subsys_initcall(hugepage_init); + +static int __init setup_transparent_hugepage(char *str) +{ + int ret = 0; + if (!str) + goto out; + if (!strcmp(str, "always")) { + set_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } else if (!strcmp(str, "madvise")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } else if (!strcmp(str, "never")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } +out: + if (!ret) + pr_warn("transparent_hugepage= cannot parse, ignored\n"); + return ret; +} +__setup("transparent_hugepage=", setup_transparent_hugepage); + +pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pmd = pmd_mkwrite(pmd); + return pmd; +} + +#ifdef CONFIG_MEMCG +static inline struct deferred_split *get_deferred_split_queue(struct page *page) +{ + struct mem_cgroup *memcg = page_memcg(compound_head(page)); + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + + if (memcg) + return &memcg->deferred_split_queue; + else + return &pgdat->deferred_split_queue; +} +#else +static inline struct deferred_split *get_deferred_split_queue(struct page *page) +{ + struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); + + return 
&pgdat->deferred_split_queue; +} +#endif + +void prep_transhuge_page(struct page *page) +{ + /* + * we use page->mapping and page->index in second tail page + * as list_head: assuming THP order >= 2 + */ + + INIT_LIST_HEAD(page_deferred_list(page)); + set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); +} + +static inline bool is_transparent_hugepage(struct page *page) +{ + if (!PageCompound(page)) + return false; + + page = compound_head(page); + return is_huge_zero_page(page) || + page[1].compound_dtor == TRANSHUGE_PAGE_DTOR; +} + +static unsigned long __thp_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, + loff_t off, unsigned long flags, unsigned long size) +{ + loff_t off_end = off + len; + loff_t off_align = round_up(off, size); + unsigned long len_pad, ret; + + if (off_end <= off_align || (off_end - off_align) < size) + return 0; + + len_pad = len + size; + if (len_pad < len || (off + len_pad) < off) + return 0; + + ret = current->mm->get_unmapped_area(filp, addr, len_pad, + off >> PAGE_SHIFT, flags); + + /* + * The failure might be due to length padding. The caller will retry + * without the padding. + */ + if (IS_ERR_VALUE(ret)) + return 0; + + /* + * Do not try to align to THP boundary if allocation at the address + * hint succeeds. 
+ */ + if (ret == addr) + return addr; + + ret += (off - ret) & (size - 1); + return ret; +} + +unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + unsigned long ret; + loff_t off = (loff_t)pgoff << PAGE_SHIFT; + + ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE); + if (ret) + return ret; + + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} +EXPORT_SYMBOL_GPL(thp_get_unmapped_area); + +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, + struct page *page, gfp_t gfp) +{ + struct vm_area_struct *vma = vmf->vma; + pgtable_t pgtable; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret = 0; + + VM_BUG_ON_PAGE(!PageCompound(page), page); + + if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); + return VM_FAULT_FALLBACK; + } + cgroup_throttle_swaprate(page, gfp); + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) { + ret = VM_FAULT_OOM; + goto release; + } + + clear_huge_page(page, vmf->address, HPAGE_PMD_NR); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * clear_huge_page writes become visible before the set_pmd_at() + * write. 
+ */ + __SetPageUptodate(page); + + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + goto unlock_release; + } else { + pmd_t entry; + + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto unlock_release; + + /* Deliver the page fault to userland */ + if (userfaultfd_missing(vma)) { + spin_unlock(vmf->ptl); + put_page(page); + pte_free(vma->vm_mm, pgtable); + ret = handle_userfault(vmf, VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + return ret; + } + + entry = mk_huge_pmd(page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + page_add_new_anon_rmap(page, vma, haddr); + lru_cache_add_inactive_or_unevictable(page, vma); + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm_inc_nr_ptes(vma->vm_mm); + spin_unlock(vmf->ptl); + count_vm_event(THP_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); + } + + return 0; +unlock_release: + spin_unlock(vmf->ptl); +release: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + put_page(page); + return ret; + +} + +/* + * always: directly stall for all thp allocations + * defer: wake kswapd and fail if not immediately available + * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise + * fail if not immediately available + * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately + * available + * never: never stall for any thp allocation + */ +gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma) +{ + const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE); + + /* Always do synchronous compaction */ + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE | (vma_madvised ? 
0 : __GFP_NORETRY); + + /* Kick kcompactd and fail quickly */ + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM; + + /* Synchronous compaction if madvised, otherwise kick kcompactd */ + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : + __GFP_KSWAPD_RECLAIM); + + /* Only do synchronous compaction if madvised */ + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return GFP_TRANSHUGE_LIGHT | + (vma_madvised ? __GFP_DIRECT_RECLAIM : 0); + + return GFP_TRANSHUGE_LIGHT; +} + +/* Caller must hold page table lock. */ +static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, + struct page *zero_page) +{ + pmd_t entry; + if (!pmd_none(*pmd)) + return; + entry = mk_pmd(zero_page, vma->vm_page_prot); + entry = pmd_mkhuge(entry); + pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, haddr, pmd, entry); + mm_inc_nr_ptes(mm); +} + +vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + gfp_t gfp; + struct folio *folio; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + + if (!transhuge_vma_suitable(vma, haddr)) + return VM_FAULT_FALLBACK; + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + khugepaged_enter_vma(vma, vma->vm_flags); + + if (!(vmf->flags & FAULT_FLAG_WRITE) && + !mm_forbids_zeropage(vma->vm_mm) && + transparent_hugepage_use_zero_page()) { + pgtable_t pgtable; + struct page *zero_page; + vm_fault_t ret; + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) + return VM_FAULT_OOM; + zero_page = mm_get_huge_zero_page(vma->vm_mm); + if (unlikely(!zero_page)) { + pte_free(vma->vm_mm, pgtable); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + 
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + ret = 0; + if (pmd_none(*vmf->pmd)) { + ret = check_stable_address_space(vma->vm_mm); + if (ret) { + spin_unlock(vmf->ptl); + pte_free(vma->vm_mm, pgtable); + } else if (userfaultfd_missing(vma)) { + spin_unlock(vmf->ptl); + pte_free(vma->vm_mm, pgtable); + ret = handle_userfault(vmf, VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + } else { + set_huge_zero_page(pgtable, vma->vm_mm, vma, + haddr, vmf->pmd, zero_page); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); + spin_unlock(vmf->ptl); + } + } else { + spin_unlock(vmf->ptl); + pte_free(vma->vm_mm, pgtable); + } + return ret; + } + gfp = vma_thp_gfp_mask(vma); + folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); + if (unlikely(!folio)) { + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } + return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); +} + +static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, + pgtable_t pgtable) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t entry; + spinlock_t *ptl; + + ptl = pmd_lock(mm, pmd); + if (!pmd_none(*pmd)) { + if (write) { + if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); + goto out_unlock; + } + entry = pmd_mkyoung(*pmd); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) + update_mmu_cache_pmd(vma, addr, pmd); + } + + goto out_unlock; + } + + entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); + if (pfn_t_devmap(pfn)) + entry = pmd_mkdevmap(entry); + if (write) { + entry = pmd_mkyoung(pmd_mkdirty(entry)); + entry = maybe_pmd_mkwrite(entry, vma); + } + + if (pgtable) { + pgtable_trans_huge_deposit(mm, pmd, pgtable); + mm_inc_nr_ptes(mm); + pgtable = NULL; + } + + set_pmd_at(mm, addr, pmd, entry); + update_mmu_cache_pmd(vma, addr, pmd); + +out_unlock: + spin_unlock(ptl); + if (pgtable) + pte_free(mm, pgtable); +} + +/** + 
* vmf_insert_pfn_pmd_prot - insert a pmd size pfn + * @vmf: Structure describing the fault + * @pfn: pfn to insert + * @pgprot: page protection to use + * @write: whether it's a write fault + * + * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and + * also consult the vmf_insert_mixed_prot() documentation when + * @pgprot != @vmf->vma->vm_page_prot. + * + * Return: vm_fault_t value. + */ +vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, + pgprot_t pgprot, bool write) +{ + unsigned long addr = vmf->address & PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + pgtable_t pgtable = NULL; + + /* + * If we had pmd_special, we could avoid all these restrictions, + * but we need to be consistent with PTEs and architectures that + * can't support a 'special' bit. + */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && + !pfn_t_devmap(pfn)); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + if (arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm); + if (!pgtable) + return VM_FAULT_OOM; + } + + track_pfn_insert(vma, &pgprot, pfn); + + insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable); + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot); + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pud = pud_mkwrite(pud); + return pud; +} + +static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, pfn_t pfn, pgprot_t prot, bool write) +{ + struct mm_struct *mm = vma->vm_mm; + pud_t entry; + spinlock_t *ptl; + + ptl = pud_lock(mm, pud); + if (!pud_none(*pud)) { + if (write) { + if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_huge_zero_pud(*pud)); + goto 
out_unlock; + } + entry = pud_mkyoung(*pud); + entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); + if (pudp_set_access_flags(vma, addr, pud, entry, 1)) + update_mmu_cache_pud(vma, addr, pud); + } + goto out_unlock; + } + + entry = pud_mkhuge(pfn_t_pud(pfn, prot)); + if (pfn_t_devmap(pfn)) + entry = pud_mkdevmap(entry); + if (write) { + entry = pud_mkyoung(pud_mkdirty(entry)); + entry = maybe_pud_mkwrite(entry, vma); + } + set_pud_at(mm, addr, pud, entry); + update_mmu_cache_pud(vma, addr, pud); + +out_unlock: + spin_unlock(ptl); +} + +/** + * vmf_insert_pfn_pud_prot - insert a pud size pfn + * @vmf: Structure describing the fault + * @pfn: pfn to insert + * @pgprot: page protection to use + * @write: whether it's a write fault + * + * Insert a pud size pfn. See vmf_insert_pfn() for additional info and + * also consult the vmf_insert_mixed_prot() documentation when + * @pgprot != @vmf->vma->vm_page_prot. + * + * Return: vm_fault_t value. + */ +vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn, + pgprot_t pgprot, bool write) +{ + unsigned long addr = vmf->address & PUD_MASK; + struct vm_area_struct *vma = vmf->vma; + + /* + * If we had pud_special, we could avoid all these restrictions, + * but we need to be consistent with PTEs and architectures that + * can't support a 'special' bit. 
+ */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && + !pfn_t_devmap(pfn)); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + track_pfn_insert(vma, &pgprot, pfn); + + insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write); + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot); +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + +static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, bool write) +{ + pmd_t _pmd; + + _pmd = pmd_mkyoung(*pmd); + if (write) + _pmd = pmd_mkdirty(_pmd); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, write)) + update_mmu_cache_pmd(vma, addr, pmd); +} + +struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, int flags, struct dev_pagemap **pgmap) +{ + unsigned long pfn = pmd_pfn(*pmd); + struct mm_struct *mm = vma->vm_mm; + struct page *page; + + assert_spin_locked(pmd_lockptr(mm, pmd)); + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET))) + return NULL; + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + return NULL; + + if (pmd_present(*pmd) && pmd_devmap(*pmd)) + /* pass */; + else + return NULL; + + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); + + /* + * device mapped pages can only be returned if the + * caller will manage the page reference count. 
+ */ + if (!(flags & (FOLL_GET | FOLL_PIN))) + return ERR_PTR(-EEXIST); + + pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) + return ERR_PTR(-EFAULT); + page = pfn_to_page(pfn); + if (!try_grab_page(page, flags)) + page = ERR_PTR(-ENOMEM); + + return page; +} + +int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) +{ + spinlock_t *dst_ptl, *src_ptl; + struct page *src_page; + pmd_t pmd; + pgtable_t pgtable = NULL; + int ret = -ENOMEM; + + /* Skip if it can be refilled on fault */ + if (!vma_is_anonymous(dst_vma)) + return 0; + + pgtable = pte_alloc_one(dst_mm); + if (unlikely(!pgtable)) + goto out; + + dst_ptl = pmd_lock(dst_mm, dst_pmd); + src_ptl = pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + + ret = -EAGAIN; + pmd = *src_pmd; + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (unlikely(is_swap_pmd(pmd))) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + if (!is_readable_migration_entry(entry)) { + entry = make_readable_migration_entry( + swp_offset(entry)); + pmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); + if (pmd_swp_uffd_wp(*src_pmd)) + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(src_mm, addr, src_pmd, pmd); + } + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm_inc_nr_ptes(dst_mm); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); + if (!userfaultfd_wp(dst_vma)) + pmd = pmd_swp_clear_uffd_wp(pmd); + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + ret = 0; + goto out_unlock; + } +#endif + + if (unlikely(!pmd_trans_huge(pmd))) { + pte_free(dst_mm, pgtable); + goto out_unlock; + } + /* + * When page table lock is held, the huge zero pmd should not be + * under splitting since we don't split the page itself, only pmd to + * a page table. 
+ */ + if (is_huge_zero_pmd(pmd)) { + /* + * mm_get_huge_zero_page() will never allocate a new page here, + * since we already have a zero page to copy. It just takes a + * reference. + */ + mm_get_huge_zero_page(dst_mm); + goto out_zero_page; + } + + src_page = pmd_page(pmd); + VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + + get_page(src_page); + if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) { + /* Page may be pinned: split and retry the fault on PTEs. */ + put_page(src_page); + pte_free(dst_mm, pgtable); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + __split_huge_pmd(src_vma, src_pmd, addr, false, NULL); + return -EAGAIN; + } + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); +out_zero_page: + mm_inc_nr_ptes(dst_mm); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); + pmdp_set_wrprotect(src_mm, addr, src_pmd); + if (!userfaultfd_wp(dst_vma)) + pmd = pmd_clear_uffd_wp(pmd); + pmd = pmd_mkold(pmd_wrprotect(pmd)); + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + + ret = 0; +out_unlock: + spin_unlock(src_ptl); + spin_unlock(dst_ptl); +out: + return ret; +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static void touch_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, bool write) +{ + pud_t _pud; + + _pud = pud_mkyoung(*pud); + if (write) + _pud = pud_mkdirty(_pud); + if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, + pud, _pud, write)) + update_mmu_cache_pud(vma, addr, pud); +} + +struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t *pud, int flags, struct dev_pagemap **pgmap) +{ + unsigned long pfn = pud_pfn(*pud); + struct mm_struct *mm = vma->vm_mm; + struct page *page; + + assert_spin_locked(pud_lockptr(mm, pud)); + + if (flags & FOLL_WRITE && !pud_write(*pud)) + return NULL; + + /* FOLL_GET and FOLL_PIN are mutually exclusive. 
*/ + if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET))) + return NULL; + + if (pud_present(*pud) && pud_devmap(*pud)) + /* pass */; + else + return NULL; + + if (flags & FOLL_TOUCH) + touch_pud(vma, addr, pud, flags & FOLL_WRITE); + + /* + * device mapped pages can only be returned if the + * caller will manage the page reference count. + * + * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here: + */ + if (!(flags & (FOLL_GET | FOLL_PIN))) + return ERR_PTR(-EEXIST); + + pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; + *pgmap = get_dev_pagemap(pfn, *pgmap); + if (!*pgmap) + return ERR_PTR(-EFAULT); + page = pfn_to_page(pfn); + if (!try_grab_page(page, flags)) + page = ERR_PTR(-ENOMEM); + + return page; +} + +int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + struct vm_area_struct *vma) +{ + spinlock_t *dst_ptl, *src_ptl; + pud_t pud; + int ret; + + dst_ptl = pud_lock(dst_mm, dst_pud); + src_ptl = pud_lockptr(src_mm, src_pud); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + + ret = -EAGAIN; + pud = *src_pud; + if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud))) + goto out_unlock; + + /* + * When page table lock is held, the huge zero pud should not be + * under splitting since we don't split the page itself, only pud to + * a page table. + */ + if (is_huge_zero_pud(pud)) { + /* No huge zero pud yet */ + } + + /* + * TODO: once we support anonymous pages, use page_try_dup_anon_rmap() + * and split if duplicating fails. 
+ */ + pudp_set_wrprotect(src_mm, addr, src_pud); + pud = pud_mkold(pud_wrprotect(pud)); + set_pud_at(dst_mm, addr, dst_pud, pud); + + ret = 0; +out_unlock: + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + return ret; +} + +void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) +{ + bool write = vmf->flags & FAULT_FLAG_WRITE; + + vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); + if (unlikely(!pud_same(*vmf->pud, orig_pud))) + goto unlock; + + touch_pud(vmf->vma, vmf->address, vmf->pud, write); +unlock: + spin_unlock(vmf->ptl); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + +void huge_pmd_set_accessed(struct vm_fault *vmf) +{ + bool write = vmf->flags & FAULT_FLAG_WRITE; + + vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) + goto unlock; + + touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); + +unlock: + spin_unlock(vmf->ptl); +} + +vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) +{ + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; + struct vm_area_struct *vma = vmf->vma; + struct folio *folio; + struct page *page; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + pmd_t orig_pmd = vmf->orig_pmd; + + vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); + VM_BUG_ON_VMA(!vma->anon_vma, vma); + + VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); + VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); + + if (is_huge_zero_pmd(orig_pmd)) + goto fallback; + + spin_lock(vmf->ptl); + + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); + return 0; + } + + page = pmd_page(orig_pmd); + folio = page_folio(page); + VM_BUG_ON_PAGE(!PageHead(page), page); + + /* Early check when only holding the PT lock. 
*/ + if (PageAnonExclusive(page)) + goto reuse; + + if (!folio_trylock(folio)) { + folio_get(folio); + spin_unlock(vmf->ptl); + folio_lock(folio); + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); + folio_unlock(folio); + folio_put(folio); + return 0; + } + folio_put(folio); + } + + /* Recheck after temporarily dropping the PT lock. */ + if (PageAnonExclusive(page)) { + folio_unlock(folio); + goto reuse; + } + + /* + * See do_wp_page(): we can only reuse the folio exclusively if + * there are no additional references. Note that we always drain + * the LRU pagevecs immediately after adding a THP. + */ + if (folio_ref_count(folio) > + 1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) + goto unlock_fallback; + if (folio_test_swapcache(folio)) + folio_free_swap(folio); + if (folio_ref_count(folio) == 1) { + pmd_t entry; + + page_move_anon_rmap(page, vma); + folio_unlock(folio); +reuse: + if (unlikely(unshare)) { + spin_unlock(vmf->ptl); + return 0; + } + entry = pmd_mkyoung(orig_pmd); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); + spin_unlock(vmf->ptl); + return VM_FAULT_WRITE; + } + +unlock_fallback: + folio_unlock(folio); + spin_unlock(vmf->ptl); +fallback: + __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); + return VM_FAULT_FALLBACK; +} + +/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ +static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) +{ + /* If the pmd is writable, we can write to the page. */ + if (pmd_write(pmd)) + return true; + + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... 
or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + + /* ... and a write-fault isn't required for other reasons. */ + if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) + return false; + return !userfaultfd_huge_pmd_wp(vma, pmd); +} + +struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, + unsigned long addr, + pmd_t *pmd, + unsigned int flags) +{ + struct mm_struct *mm = vma->vm_mm; + struct page *page; + + assert_spin_locked(pmd_lockptr(mm, pmd)); + + page = pmd_page(*pmd); + VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); + + if ((flags & FOLL_WRITE) && + !can_follow_write_pmd(*pmd, page, vma, flags)) + return NULL; + + /* Avoid dumping huge zero page */ + if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) + return ERR_PTR(-EFAULT); + + /* Full NUMA hinting faults to serialise migration in fault paths */ + if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags)) + return NULL; + + if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) + return ERR_PTR(-EMLINK); + + VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); + + if (!try_grab_page(page, flags)) + return ERR_PTR(-ENOMEM); + + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); + + page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; + VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); + + return page; +} + +/* NUMA hinting page fault entry point for trans huge pmds */ +vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + pmd_t oldpmd = vmf->orig_pmd; + pmd_t pmd; + struct page 
*page; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + int page_nid = NUMA_NO_NODE; + int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); + bool migrated = false; + bool was_writable = pmd_savedwrite(oldpmd); + int flags = 0; + + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { + spin_unlock(vmf->ptl); + goto out; + } + + pmd = pmd_modify(oldpmd, vma->vm_page_prot); + page = vm_normal_page_pmd(vma, haddr, pmd); + if (!page) + goto out_map; + + /* See similar comment in do_numa_page for explanation */ + if (!was_writable) + flags |= TNF_NO_GROUP; + + page_nid = page_to_nid(page); + /* + * For memory tiering mode, cpupid of slow memory page is used + * to record page access time. So use default value. + */ + if (node_is_toptier(page_nid)) + last_cpupid = page_cpupid_last(page); + target_nid = numa_migrate_prep(page, vma, haddr, page_nid, + &flags); + + if (target_nid == NUMA_NO_NODE) { + put_page(page); + goto out_map; + } + + spin_unlock(vmf->ptl); + + migrated = migrate_misplaced_page(page, vma, target_nid); + if (migrated) { + flags |= TNF_MIGRATED; + page_nid = target_nid; + } else { + flags |= TNF_MIGRATE_FAIL; + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { + spin_unlock(vmf->ptl); + goto out; + } + goto out_map; + } + +out: + if (page_nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, + flags); + + return 0; + +out_map: + /* Restore the PMD */ + pmd = pmd_modify(oldpmd, vma->vm_page_prot); + pmd = pmd_mkyoung(pmd); + if (was_writable) + pmd = pmd_mkwrite(pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); + spin_unlock(vmf->ptl); + goto out; +} + +/* + * Return true if we do MADV_FREE successfully on entire pmd page. + * Otherwise, return false. 
+ */ +bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long next) +{ + spinlock_t *ptl; + pmd_t orig_pmd; + struct page *page; + struct mm_struct *mm = tlb->mm; + bool ret = false; + + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); + + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + goto out_unlocked; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) + goto out; + + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto out; + } + + page = pmd_page(orig_pmd); + /* + * If other processes are mapping this page, we can't discard + * the page unless they all do MADV_FREE, so let's skip the page. + */ + if (total_mapcount(page) != 1) + goto out; + + if (!trylock_page(page)) + goto out; + + /* + * If the user wants to discard only part of the THP, split it so + * MADV_FREE will deactivate only those pages. + */ + if (next - addr != HPAGE_PMD_SIZE) { + get_page(page); + spin_unlock(ptl); + split_huge_page(page); + unlock_page(page); + put_page(page); + goto out_unlocked; + } + + if (PageDirty(page)) + ClearPageDirty(page); + unlock_page(page); + + if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { + pmdp_invalidate(vma, addr, pmd); + orig_pmd = pmd_mkold(orig_pmd); + orig_pmd = pmd_mkclean(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + } + + mark_page_lazyfree(page); + ret = true; +out: + spin_unlock(ptl); +out_unlocked: + return ret; +} + +static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) +{ + pgtable_t pgtable; + + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pte_free(mm, pgtable); + mm_dec_nr_ptes(mm); +} + +int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr) +{ + pmd_t orig_pmd; + spinlock_t *ptl; + + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); + + ptl = __pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; 
+ /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pmdp_huge_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pmdp related + * operations. + */ + orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, + tlb->fullmm); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + if (vma_is_special_huge(vma)) { + if (arch_needs_pgtable_deposit()) + zap_deposited_table(tlb->mm, pmd); + spin_unlock(ptl); + } else if (is_huge_zero_pmd(orig_pmd)) { + zap_deposited_table(tlb->mm, pmd); + spin_unlock(ptl); + } else { + struct page *page = NULL; + int flush_needed = 1; + + if (pmd_present(orig_pmd)) { + page = pmd_page(orig_pmd); + page_remove_rmap(page, vma, true); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + VM_BUG_ON_PAGE(!PageHead(page), page); + } else if (thp_migration_supported()) { + swp_entry_t entry; + + VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); + entry = pmd_to_swp_entry(orig_pmd); + page = pfn_swap_entry_to_page(entry); + flush_needed = 0; + } else + WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); + + if (PageAnon(page)) { + zap_deposited_table(tlb->mm, pmd); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + } else { + if (arch_needs_pgtable_deposit()) + zap_deposited_table(tlb->mm, pmd); + add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR); + } + + spin_unlock(ptl); + if (flush_needed) + tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); + } + return 1; +} + +#ifndef pmd_move_must_withdraw +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, + spinlock_t *old_pmd_ptl, + struct vm_area_struct *vma) +{ + /* + * With split pmd lock we also need to move preallocated + * PTE page table if new_pmd is on different PMD page table. + * + * We also don't deposit and withdraw tables for file pages. 
+ */ + return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); +} +#endif + +static pmd_t move_soft_dirty_pmd(pmd_t pmd) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + if (unlikely(is_pmd_migration_entry(pmd))) + pmd = pmd_swp_mksoft_dirty(pmd); + else if (pmd_present(pmd)) + pmd = pmd_mksoft_dirty(pmd); +#endif + return pmd; +} + +bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) +{ + spinlock_t *old_ptl, *new_ptl; + pmd_t pmd; + struct mm_struct *mm = vma->vm_mm; + bool force_flush = false; + + /* + * The destination pmd shouldn't be established, free_pgtables() + * should have released it. + */ + if (WARN_ON(!pmd_none(*new_pmd))) { + VM_BUG_ON(pmd_trans_huge(*new_pmd)); + return false; + } + + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_lock prevents deadlock. + */ + old_ptl = __pmd_trans_huge_lock(old_pmd, vma); + if (old_ptl) { + new_ptl = pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); + if (pmd_present(pmd)) + force_flush = true; + VM_BUG_ON(!pmd_none(*new_pmd)); + + if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { + pgtable_t pgtable; + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); + } + pmd = move_soft_dirty_pmd(pmd); + set_pmd_at(mm, new_addr, new_pmd, pmd); + if (force_flush) + flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + return true; + } + return false; +} + +/* + * Returns + * - 0 if PMD could not be locked + * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary + * or if prot_numa but THP migration is not supported + * - HPAGE_PMD_NR if protections changed and TLB flush necessary + */ +int change_huge_pmd(struct mmu_gather *tlb, struct 
vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, pgprot_t newprot, + unsigned long cp_flags) +{ + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; + pmd_t oldpmd, entry; + bool preserve_write; + int ret; + bool prot_numa = cp_flags & MM_CP_PROT_NUMA; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); + + if (prot_numa && !thp_migration_supported()) + return 1; + + ptl = __pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; + + preserve_write = prot_numa && pmd_write(*pmd); + ret = 1; + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (is_swap_pmd(*pmd)) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); + struct page *page = pfn_swap_entry_to_page(entry); + pmd_t newpmd; + + VM_BUG_ON(!is_pmd_migration_entry(*pmd)); + if (is_writable_migration_entry(entry)) { + /* + * A protection check is difficult so + * just be safe and disable write + */ + if (PageAnon(page)) + entry = make_readable_exclusive_migration_entry(swp_offset(entry)); + else + entry = make_readable_migration_entry(swp_offset(entry)); + newpmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*pmd)) + newpmd = pmd_swp_mksoft_dirty(newpmd); + if (pmd_swp_uffd_wp(*pmd)) + newpmd = pmd_swp_mkuffd_wp(newpmd); + } else { + newpmd = *pmd; + } + + if (uffd_wp) + newpmd = pmd_swp_mkuffd_wp(newpmd); + else if (uffd_wp_resolve) + newpmd = pmd_swp_clear_uffd_wp(newpmd); + if (!pmd_same(*pmd, newpmd)) + set_pmd_at(mm, addr, pmd, newpmd); + goto unlock; + } +#endif + + if (prot_numa) { + struct page *page; + bool toptier; + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. 
+ */ + if (is_huge_zero_pmd(*pmd)) + goto unlock; + + if (pmd_protnone(*pmd)) + goto unlock; + + page = pmd_page(*pmd); + toptier = node_is_toptier(page_to_nid(page)); + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && + toptier) + goto unlock; + + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !toptier) + xchg_page_access_time(page, jiffies_to_msecs(jiffies)); + } + /* + * In the prot_numa case, we are under mmap_read_lock(mm). It's critical + * to not clear pmd intermittently to avoid race with MADV_DONTNEED + * which is also under mmap_read_lock(mm): + * + * CPU0: CPU1: + * change_huge_pmd(prot_numa=1) + * pmdp_huge_get_and_clear_notify() + * madvise_dontneed() + * zap_pmd_range() + * pmd_trans_huge(*pmd) == 0 (without ptl) + * // skip the pmd + * set_pmd_at(); + * // pmd is re-established + * + * The race makes MADV_DONTNEED miss the huge pmd and fail to clear it, + * which may break userspace. + * + * pmdp_invalidate_ad() is required to make sure we don't miss + * dirty/young flags set by hardware. + */ + oldpmd = pmdp_invalidate_ad(vma, addr, pmd); + + entry = pmd_modify(oldpmd, newprot); + if (preserve_write) + entry = pmd_mk_savedwrite(entry); + if (uffd_wp) { + entry = pmd_wrprotect(entry); + entry = pmd_mkuffd_wp(entry); + } else if (uffd_wp_resolve) { + /* + * Leave the write bit to be handled by PF interrupt + * handler, then things like COW could be properly + * handled. + */ + entry = pmd_clear_uffd_wp(entry); + } + ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); + + if (huge_pmd_needs_flush(oldpmd, entry)) + tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); + + BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); +unlock: + spin_unlock(ptl); + return ret; +} + +/* + * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise. 
+ * + * Note that if it returns page table lock pointer, this routine returns without + * unlocking page table lock. So callers must unlock it. + */ +spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) +{ + spinlock_t *ptl; + ptl = pmd_lock(vma->vm_mm, pmd); + if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || + pmd_devmap(*pmd))) + return ptl; + spin_unlock(ptl); + return NULL; +} + +/* + * Returns page table lock pointer if a given pud maps a thp, NULL otherwise. + * + * Note that if it returns page table lock pointer, this routine returns without + * unlocking page table lock. So callers must unlock it. + */ +spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) +{ + spinlock_t *ptl; + + ptl = pud_lock(vma->vm_mm, pud); + if (likely(pud_trans_huge(*pud) || pud_devmap(*pud))) + return ptl; + spin_unlock(ptl); + return NULL; +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pud, unsigned long addr) +{ + spinlock_t *ptl; + + ptl = __pud_trans_huge_lock(pud, vma); + if (!ptl) + return 0; + + pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); + tlb_remove_pud_tlb_entry(tlb, pud, addr); + if (vma_is_special_huge(vma)) { + spin_unlock(ptl); + /* No zero page support yet */ + } else { + /* No support for anonymous PUD pages yet */ + BUG(); + } + return 1; +} + +static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, + unsigned long haddr) +{ + VM_BUG_ON(haddr & ~HPAGE_PUD_MASK); + VM_BUG_ON_VMA(vma->vm_start > haddr, vma); + VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma); + VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud)); + + count_vm_event(THP_SPLIT_PUD); + + pudp_huge_clear_flush_notify(vma, haddr, pud); +} + +void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, + unsigned long address) +{ + spinlock_t *ptl; + struct mmu_notifier_range range; + + mmu_notifier_range_init(&range, 
MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address & HPAGE_PUD_MASK, + (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); + mmu_notifier_invalidate_range_start(&range); + ptl = pud_lock(vma->vm_mm, pud); + if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) + goto out; + __split_huge_pud_locked(vma, pud, range.start); + +out: + spin_unlock(ptl); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pudp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(&range); +} +#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ + +static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd) +{ + struct mm_struct *mm = vma->vm_mm; + pgtable_t pgtable; + pmd_t _pmd, old_pmd; + int i; + + /* + * Leave pmd empty until pte is filled note that it is fine to delay + * notification until mmu_notifier_invalidate_range_end() as we are + * replacing a zero pmd write protected page with a zero pte write + * protected page. 
+ * + * See Documentation/mm/mmu_notifier.rst + */ + old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); + + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); + entry = pte_mkspecial(entry); + if (pmd_uffd_wp(old_pmd)) + entry = pte_mkuffd_wp(entry); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); +} + +static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long haddr, bool freeze) +{ + struct mm_struct *mm = vma->vm_mm; + struct page *page; + pgtable_t pgtable; + pmd_t old_pmd, _pmd; + bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; + bool anon_exclusive = false, dirty = false; + unsigned long addr; + int i; + + VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); + VM_BUG_ON_VMA(vma->vm_start > haddr, vma); + VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); + VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) + && !pmd_devmap(*pmd)); + + count_vm_event(THP_SPLIT_PMD); + + if (!vma_is_anonymous(vma)) { + old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + /* + * We are going to unmap this huge page. 
So + * just go ahead and zap it + */ + if (arch_needs_pgtable_deposit()) + zap_deposited_table(mm, pmd); + if (vma_is_special_huge(vma)) + return; + if (unlikely(is_pmd_migration_entry(old_pmd))) { + swp_entry_t entry; + + entry = pmd_to_swp_entry(old_pmd); + page = pfn_swap_entry_to_page(entry); + } else { + page = pmd_page(old_pmd); + if (!PageDirty(page) && pmd_dirty(old_pmd)) + set_page_dirty(page); + if (!PageReferenced(page) && pmd_young(old_pmd)) + SetPageReferenced(page); + page_remove_rmap(page, vma, true); + put_page(page); + } + add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); + return; + } + + if (is_huge_zero_pmd(*pmd)) { + /* + * FIXME: Do we want to invalidate secondary mmu by calling + * mmu_notifier_invalidate_range() see comments below inside + * __split_huge_pmd() ? + * + * We are going from a zero huge page write protected to zero + * small page also write protected so it does not seems useful + * to invalidate secondary mmu at this time. + */ + return __split_huge_zero_page_pmd(vma, haddr, pmd); + } + + /* + * Up to this point the pmd is present and huge and userland has the + * whole access to the hugepage during the split (which happens in + * place). If we overwrite the pmd with the not-huge version pointing + * to the pte here (which of course we could if all CPUs were bug + * free), userland could trigger a small page size TLB miss on the + * small sized TLB while the hugepage TLB entry is still established in + * the huge TLB. Some CPU doesn't like that. + * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum + * 383 on page 105. Intel should be safe but is also warns that it's + * only safe if the permission and cache attributes of the two entries + * loaded in the two TLB is identical (which should be the case here). + * But it is generally safer to never allow small and huge TLB entries + * for the same virtual address to be loaded simultaneously. 
So instead + * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the + * current pmd notpresent (atomically because here the pmd_trans_huge + * must remain set at all times on the pmd until the split is complete + * for this pmd), then we flush the SMP TLB and finally we write the + * non-huge version of the pmd entry with pmd_populate. + */ + old_pmd = pmdp_invalidate(vma, haddr, pmd); + + pmd_migration = is_pmd_migration_entry(old_pmd); + if (unlikely(pmd_migration)) { + swp_entry_t entry; + + entry = pmd_to_swp_entry(old_pmd); + page = pfn_swap_entry_to_page(entry); + write = is_writable_migration_entry(entry); + if (PageAnon(page)) + anon_exclusive = is_readable_exclusive_migration_entry(entry); + young = is_migration_entry_young(entry); + dirty = is_migration_entry_dirty(entry); + soft_dirty = pmd_swp_soft_dirty(old_pmd); + uffd_wp = pmd_swp_uffd_wp(old_pmd); + } else { + page = pmd_page(old_pmd); + if (pmd_dirty(old_pmd)) { + dirty = true; + SetPageDirty(page); + } + write = pmd_write(old_pmd); + young = pmd_young(old_pmd); + soft_dirty = pmd_soft_dirty(old_pmd); + uffd_wp = pmd_uffd_wp(old_pmd); + + VM_BUG_ON_PAGE(!page_count(page), page); + page_ref_add(page, HPAGE_PMD_NR - 1); + + /* + * Without "freeze", we'll simply split the PMD, propagating the + * PageAnonExclusive() flag for each PTE by setting it for + * each subpage -- no need to (temporarily) clear. + * + * With "freeze" we want to replace mapped pages by + * migration entries right away. This is only possible if we + * managed to clear PageAnonExclusive() -- see + * set_pmd_migration_entry(). + * + * In case we cannot clear PageAnonExclusive(), split the PMD + * only and let try_to_migrate_one() fail later. + * + * See page_try_share_anon_rmap(): invalidate PMD first. 
+ */ + anon_exclusive = PageAnon(page) && PageAnonExclusive(page); + if (freeze && anon_exclusive && page_try_share_anon_rmap(page)) + freeze = false; + } + + /* + * Withdraw the table only after we mark the pmd entry invalid. + * This's critical for some architectures (Power). + */ + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + pte_t entry, *pte; + /* + * Note that NUMA hinting access restrictions are not + * transferred to avoid any possibility of altering + * permissions across VMAs. + */ + if (freeze || pmd_migration) { + swp_entry_t swp_entry; + if (write) + swp_entry = make_writable_migration_entry( + page_to_pfn(page + i)); + else if (anon_exclusive) + swp_entry = make_readable_exclusive_migration_entry( + page_to_pfn(page + i)); + else + swp_entry = make_readable_migration_entry( + page_to_pfn(page + i)); + if (young) + swp_entry = make_migration_entry_young(swp_entry); + if (dirty) + swp_entry = make_migration_entry_dirty(swp_entry); + entry = swp_entry_to_pte(swp_entry); + if (soft_dirty) + entry = pte_swp_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_swp_mkuffd_wp(entry); + } else { + entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); + entry = maybe_mkwrite(entry, vma); + if (anon_exclusive) + SetPageAnonExclusive(page + i); + if (!write) + entry = pte_wrprotect(entry); + if (!young) + entry = pte_mkold(entry); + /* + * NOTE: we don't do pte_mkdirty when dirty==true + * because it breaks sparc64 which can sigsegv + * random process. Need to revisit when we figure + * out what is special with sparc64. 
+ */ + if (soft_dirty) + entry = pte_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); + } + pte = pte_offset_map(&_pmd, addr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, addr, pte, entry); + if (!pmd_migration) + atomic_inc(&page[i]._mapcount); + pte_unmap(pte); + } + + if (!pmd_migration) { + /* + * Set PG_double_map before dropping compound_mapcount to avoid + * false-negative page_mapped(). + */ + if (compound_mapcount(page) > 1 && + !TestSetPageDoubleMap(page)) { + for (i = 0; i < HPAGE_PMD_NR; i++) + atomic_inc(&page[i]._mapcount); + } + + lock_page_memcg(page); + if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { + /* Last compound_mapcount is gone. */ + __mod_lruvec_page_state(page, NR_ANON_THPS, + -HPAGE_PMD_NR); + if (TestClearPageDoubleMap(page)) { + /* No need in mapcount reference anymore */ + for (i = 0; i < HPAGE_PMD_NR; i++) + atomic_dec(&page[i]._mapcount); + } + } + unlock_page_memcg(page); + + /* Above is effectively page_remove_rmap(page, vma, true) */ + munlock_vma_page(page, vma, true); + } + + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + + if (freeze) { + for (i = 0; i < HPAGE_PMD_NR; i++) { + page_remove_rmap(page + i, vma, false); + put_page(page + i); + } + } +} + +void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, bool freeze, struct folio *folio) +{ + spinlock_t *ptl; + struct mmu_notifier_range range; + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address & HPAGE_PMD_MASK, + (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + ptl = pmd_lock(vma->vm_mm, pmd); + + /* + * If caller asks to setup a migration entry, we need a folio to check + * pmd against. Otherwise we can end up replacing wrong folio. 
+ */ + VM_BUG_ON(freeze && !folio); + VM_WARN_ON_ONCE(folio && !folio_test_locked(folio)); + + if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || + is_pmd_migration_entry(*pmd)) { + /* + * It's safe to call pmd_page when folio is set because it's + * guaranteed that pmd is present. + */ + if (folio && folio != page_folio(pmd_page(*pmd))) + goto out; + __split_huge_pmd_locked(vma, pmd, range.start, freeze); + } + +out: + spin_unlock(ptl); + /* + * No need to double call mmu_notifier->invalidate_range() callback. + * They are 3 cases to consider inside __split_huge_pmd_locked(): + * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious + * 2) __split_huge_zero_page_pmd() read only zero page and any write + * fault will trigger a flush_notify before pointing to a new page + * (it is fine if the secondary mmu keeps pointing to the old zero + * page in the meantime) + * 3) Split a huge pmd into pte pointing to the same page. No need + * to invalidate secondary tlb entry they are all still valid. + * any further changes to individual pte will notify. So no need + * to call mmu_notifier->invalidate_range() + */ + mmu_notifier_invalidate_range_only_end(&range); +} + +void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, + bool freeze, struct folio *folio) +{ + pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); + + if (!pmd) + return; + + __split_huge_pmd(vma, pmd, address, freeze, folio); +} + +static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) +{ + /* + * If the new address isn't hpage aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. 
+ */ + if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && + range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), + ALIGN(address, HPAGE_PMD_SIZE))) + split_huge_pmd_address(vma, address, false, NULL); +} + +void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + /* Check if we need to split start first. */ + split_huge_pmd_if_needed(vma, start); + + /* Check if we need to split end next. */ + split_huge_pmd_if_needed(vma, end); + + /* + * If we're also updating the next vma vm_start, + * check if we need to split it. + */ + if (adjust_next > 0) { + struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); + unsigned long nstart = next->vm_start; + nstart += adjust_next; + split_huge_pmd_if_needed(next, nstart); + } +} + +static void unmap_folio(struct folio *folio) +{ + enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | + TTU_SYNC; + + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + + /* + * Anon pages need migration entries to preserve them, but file + * pages can simply be left unmapped, then faulted back on demand. + * If that is ever changed (perhaps for mlock), update remap_page(). 
+ */ + if (folio_test_anon(folio)) + try_to_migrate(folio, ttu_flags); + else + try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK); +} + +static void remap_page(struct folio *folio, unsigned long nr) +{ + int i = 0; + + /* If unmap_folio() uses try_to_migrate() on file, remove this check */ + if (!folio_test_anon(folio)) + return; + for (;;) { + remove_migration_ptes(folio, folio, true); + i += folio_nr_pages(folio); + if (i >= nr) + break; + folio = folio_next(folio); + } +} + +static void lru_add_page_tail(struct page *head, struct page *tail, + struct lruvec *lruvec, struct list_head *list) +{ + VM_BUG_ON_PAGE(!PageHead(head), head); + VM_BUG_ON_PAGE(PageCompound(tail), head); + VM_BUG_ON_PAGE(PageLRU(tail), head); + lockdep_assert_held(&lruvec->lru_lock); + + if (list) { + /* page reclaim is reclaiming a huge page */ + VM_WARN_ON(PageLRU(head)); + get_page(tail); + list_add_tail(&tail->lru, list); + } else { + /* head is still on lru (and we have it frozen) */ + VM_WARN_ON(!PageLRU(head)); + if (PageUnevictable(tail)) + tail->mlock_count = 0; + else + list_add_tail(&tail->lru, &head->lru); + SetPageLRU(tail); + } +} + +static void __split_huge_page_tail(struct page *head, int tail, + struct lruvec *lruvec, struct list_head *list) +{ + struct page *page_tail = head + tail; + + VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); + + /* + * Clone page flags before unfreezing refcount. + * + * After successful get_page_unless_zero() might follow flags change, + * for example lock_page() which set PG_waiters. + * + * Note that for mapped sub-pages of an anonymous THP, + * PG_anon_exclusive has been cleared in unmap_folio() and is stored in + * the migration entry instead from where remap_page() will restore it. + * We can still have PG_anon_exclusive set on effectively unmapped and + * unreferenced sub-pages of an anonymous THP: we can simply drop + * PG_anon_exclusive (-> PG_mappedtodisk) for these here. 
+ */ + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page_tail->flags |= (head->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_swapcache) | + (1L << PG_mlocked) | + (1L << PG_uptodate) | + (1L << PG_active) | + (1L << PG_workingset) | + (1L << PG_locked) | + (1L << PG_unevictable) | +#ifdef CONFIG_64BIT + (1L << PG_arch_2) | +#endif + (1L << PG_dirty) | + LRU_GEN_MASK | LRU_REFS_MASK)); + + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, + page_tail); + page_tail->mapping = head->mapping; + page_tail->index = head->index + tail; + + /* + * page->private should not be set in tail pages with the exception + * of swap cache pages that store the swp_entry_t in tail pages. + * Fix up and warn once if private is unexpectedly set. + */ + if (!folio_test_swapcache(page_folio(head))) { + VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); + page_tail->private = 0; + } + + /* Page flags must be visible before we make the page non-compound. */ + smp_wmb(); + + /* + * Clear PageTail before unfreezing page refcount. + * + * After successful get_page_unless_zero() might follow put_page() + * which needs correct compound_head(). + */ + clear_compound_head(page_tail); + + /* Finally unfreeze refcount. Additional reference from page cache. */ + page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || + PageSwapCache(head))); + + if (page_is_young(head)) + set_page_young(page_tail); + if (page_is_idle(head)) + set_page_idle(page_tail); + + page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + + /* + * always add to the tail because some iterators expect new + * pages to show after the currently processed elements - e.g. 
+ * migrate_pages + */ + lru_add_page_tail(head, page_tail, lruvec, list); +} + +static void __split_huge_page(struct page *page, struct list_head *list, + pgoff_t end) +{ + struct folio *folio = page_folio(page); + struct page *head = &folio->page; + struct lruvec *lruvec; + struct address_space *swap_cache = NULL; + unsigned long offset = 0; + unsigned int nr = thp_nr_pages(head); + int i; + + /* complete memcg works before add pages to LRU */ + split_page_memcg(head, nr); + + if (PageAnon(head) && PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + offset = swp_offset(entry); + swap_cache = swap_address_space(entry); + xa_lock(&swap_cache->i_pages); + } + + /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ + lruvec = folio_lruvec_lock(folio); + + ClearPageHasHWPoisoned(head); + + for (i = nr - 1; i >= 1; i--) { + __split_huge_page_tail(head, i, lruvec, list); + /* Some pages can be beyond EOF: drop them from page cache */ + if (head[i].index >= end) { + struct folio *tail = page_folio(head + i); + + if (shmem_mapping(head->mapping)) + shmem_uncharge(head->mapping->host, 1); + else if (folio_test_clear_dirty(tail)) + folio_account_cleaned(tail, + inode_to_wb(folio->mapping->host)); + __filemap_remove_folio(tail, NULL); + folio_put(tail); + } else if (!PageAnon(page)) { + __xa_store(&head->mapping->i_pages, head[i].index, + head + i, 0); + } else if (swap_cache) { + __xa_store(&swap_cache->i_pages, offset + i, + head + i, 0); + } + } + + ClearPageCompound(head); + unlock_page_lruvec(lruvec); + /* Caller disabled irqs, so they are still disabled here */ + + split_page_owner(head, nr); + + /* See comment in __split_huge_page_tail() */ + if (PageAnon(head)) { + /* Additional pin to swap cache */ + if (PageSwapCache(head)) { + page_ref_add(head, 2); + xa_unlock(&swap_cache->i_pages); + } else { + page_ref_inc(head); + } + } else { + /* Additional pin to page cache */ + page_ref_add(head, 2); + 
xa_unlock(&head->mapping->i_pages); + } + local_irq_enable(); + + remap_page(folio, nr); + + if (PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + split_swap_cluster(entry); + } + + for (i = 0; i < nr; i++) { + struct page *subpage = head + i; + if (subpage == page) + continue; + unlock_page(subpage); + + /* + * Subpages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + free_page_and_swap_cache(subpage); + } +} + +/* Racy check whether the huge page can be split */ +bool can_split_folio(struct folio *folio, int *pextra_pins) +{ + int extra_pins; + + /* Additional pins from page cache */ + if (folio_test_anon(folio)) + extra_pins = folio_test_swapcache(folio) ? + folio_nr_pages(folio) : 0; + else + extra_pins = folio_nr_pages(folio); + if (pextra_pins) + *pextra_pins = extra_pins; + return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1; +} + +/* + * This function splits huge page into normal pages. @page can point to any + * subpage of huge page to split. Split doesn't change the position of @page. + * + * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. + * The huge page must be locked. + * + * If @list is null, tail pages will be added to LRU list, otherwise, to @list. + * + * Both head page and tail pages will inherit mapping, flags, and so on from + * the hugepage. + * + * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if + * they are not mapped. + * + * Returns 0 if the hugepage is split successfully. + * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under + * us. 
+ */ +int split_huge_page_to_list(struct page *page, struct list_head *list) +{ + struct folio *folio = page_folio(page); + struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page); + XA_STATE(xas, &folio->mapping->i_pages, folio->index); + struct anon_vma *anon_vma = NULL; + struct address_space *mapping = NULL; + int extra_pins, ret; + pgoff_t end; + bool is_hzp; + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + + is_hzp = is_huge_zero_page(&folio->page); + if (is_hzp) { + pr_warn_ratelimited("Called split_huge_page for huge zero page\n"); + return -EBUSY; + } + + if (folio_test_writeback(folio)) + return -EBUSY; + + if (folio_test_anon(folio)) { + /* + * The caller does not necessarily hold an mmap_lock that would + * prevent the anon_vma disappearing so we first we take a + * reference to it and then lock the anon_vma for write. This + * is similar to folio_lock_anon_vma_read except the write lock + * is taken to serialise against parallel split or collapse + * operations. + */ + anon_vma = folio_get_anon_vma(folio); + if (!anon_vma) { + ret = -EBUSY; + goto out; + } + end = -1; + mapping = NULL; + anon_vma_lock_write(anon_vma); + } else { + gfp_t gfp; + + mapping = folio->mapping; + + /* Truncated ? */ + if (!mapping) { + ret = -EBUSY; + goto out; + } + + gfp = current_gfp_context(mapping_gfp_mask(mapping) & + GFP_RECLAIM_MASK); + + if (!filemap_release_folio(folio, gfp)) { + ret = -EBUSY; + goto out; + } + + xas_split_alloc(&xas, folio, folio_order(folio), gfp); + if (xas_error(&xas)) { + ret = xas_error(&xas); + goto out; + } + + anon_vma = NULL; + i_mmap_lock_read(mapping); + + /* + *__split_huge_page() may need to trim off pages beyond EOF: + * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, + * which cannot be nested inside the page tree lock. So note + * end now: i_size itself may be changed at any moment, but + * folio lock is good enough to serialize the trimming. 
+ */ + end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (shmem_mapping(mapping)) + end = shmem_fallocend(mapping->host, end); + } + + /* + * Racy check if we can split the page, before unmap_folio() will + * split PMDs + */ + if (!can_split_folio(folio, &extra_pins)) { + ret = -EBUSY; + goto out_unlock; + } + + unmap_folio(folio); + + /* block interrupt reentry in xa_lock and spinlock */ + local_irq_disable(); + if (mapping) { + /* + * Check if the folio is present in page cache. + * We assume all tail are present too, if folio is there. + */ + xas_lock(&xas); + xas_reset(&xas); + if (xas_load(&xas) != folio) + goto fail; + } + + /* Prevent deferred_split_scan() touching ->_refcount */ + spin_lock(&ds_queue->split_queue_lock); + if (folio_ref_freeze(folio, 1 + extra_pins)) { + if (!list_empty(page_deferred_list(&folio->page))) { + ds_queue->split_queue_len--; + list_del(page_deferred_list(&folio->page)); + } + spin_unlock(&ds_queue->split_queue_lock); + if (mapping) { + int nr = folio_nr_pages(folio); + + xas_split(&xas, folio, folio_order(folio)); + if (folio_test_pmd_mappable(folio)) { + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, + NR_SHMEM_THPS, -nr); + } else { + __lruvec_stat_mod_folio(folio, + NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); + } + } + } + + __split_huge_page(page, list, end); + ret = 0; + } else { + spin_unlock(&ds_queue->split_queue_lock); +fail: + if (mapping) + xas_unlock(&xas); + local_irq_enable(); + remap_page(folio, folio_nr_pages(folio)); + ret = -EBUSY; + } + +out_unlock: + if (anon_vma) { + anon_vma_unlock_write(anon_vma); + put_anon_vma(anon_vma); + } + if (mapping) + i_mmap_unlock_read(mapping); +out: + xas_destroy(&xas); + count_vm_event(!ret ? 
THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); + return ret; +} + +void free_transhuge_page(struct page *page) +{ + struct deferred_split *ds_queue = get_deferred_split_queue(page); + unsigned long flags; + + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + if (!list_empty(page_deferred_list(page))) { + ds_queue->split_queue_len--; + list_del(page_deferred_list(page)); + } + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + free_compound_page(page); +} + +void deferred_split_huge_page(struct page *page) +{ + struct deferred_split *ds_queue = get_deferred_split_queue(page); +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg = page_memcg(compound_head(page)); +#endif + unsigned long flags; + + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + + /* + * The try_to_unmap() in page reclaim path might reach here too, + * this may cause a race condition to corrupt deferred split queue. + * And, if page reclaim is already handling the same page, it is + * unnecessary to handle it again in shrinker. + * + * Check PageSwapCache to determine if the page is being + * handled by page reclaim since THP swap would add the page into + * swap cache before calling try_to_unmap(). 
+ */ + if (PageSwapCache(page)) + return; + + if (!list_empty(page_deferred_list(page))) + return; + + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + if (list_empty(page_deferred_list(page))) { + count_vm_event(THP_DEFERRED_SPLIT_PAGE); + list_add_tail(page_deferred_list(page), &ds_queue->split_queue); + ds_queue->split_queue_len++; +#ifdef CONFIG_MEMCG + if (memcg) + set_shrinker_bit(memcg, page_to_nid(page), + deferred_split_shrinker.id); +#endif + } + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); +} + +static unsigned long deferred_split_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct pglist_data *pgdata = NODE_DATA(sc->nid); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + +#ifdef CONFIG_MEMCG + if (sc->memcg) + ds_queue = &sc->memcg->deferred_split_queue; +#endif + return READ_ONCE(ds_queue->split_queue_len); +} + +static unsigned long deferred_split_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct pglist_data *pgdata = NODE_DATA(sc->nid); + struct deferred_split *ds_queue = &pgdata->deferred_split_queue; + unsigned long flags; + LIST_HEAD(list), *pos, *next; + struct page *page; + int split = 0; + +#ifdef CONFIG_MEMCG + if (sc->memcg) + ds_queue = &sc->memcg->deferred_split_queue; +#endif + + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + /* Take pin on all head pages to avoid freeing them under us */ + list_for_each_safe(pos, next, &ds_queue->split_queue) { + page = list_entry((void *)pos, struct page, deferred_list); + page = compound_head(page); + if (get_page_unless_zero(page)) { + list_move(page_deferred_list(page), &list); + } else { + /* We lost race with put_compound_page() */ + list_del_init(page_deferred_list(page)); + ds_queue->split_queue_len--; + } + if (!--sc->nr_to_scan) + break; + } + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + + list_for_each_safe(pos, next, &list) { + page = list_entry((void *)pos, struct page, 
deferred_list); + if (!trylock_page(page)) + goto next; + /* split_huge_page() removes page from list on success */ + if (!split_huge_page(page)) + split++; + unlock_page(page); +next: + put_page(page); + } + + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + list_splice_tail(&list, &ds_queue->split_queue); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + + /* + * Stop shrinker if we didn't split any page, but the queue is empty. + * This can happen if pages were freed under us. + */ + if (!split && list_empty(&ds_queue->split_queue)) + return SHRINK_STOP; + return split; +} + +static struct shrinker deferred_split_shrinker = { + .count_objects = deferred_split_count, + .scan_objects = deferred_split_scan, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | + SHRINKER_NONSLAB, +}; + +#ifdef CONFIG_DEBUG_FS +static void split_huge_pages_all(void) +{ + struct zone *zone; + struct page *page; + unsigned long pfn, max_zone_pfn; + unsigned long total = 0, split = 0; + + pr_debug("Split all THPs\n"); + for_each_zone(zone) { + if (!managed_zone(zone)) + continue; + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { + int nr_pages; + + page = pfn_to_online_page(pfn); + if (!page || !get_page_unless_zero(page)) + continue; + + if (zone != page_zone(page)) + goto next; + + if (!PageHead(page) || PageHuge(page) || !PageLRU(page)) + goto next; + + total++; + lock_page(page); + nr_pages = thp_nr_pages(page); + if (!split_huge_page(page)) + split++; + pfn += nr_pages - 1; + unlock_page(page); +next: + put_page(page); + cond_resched(); + } + } + + pr_debug("%lu of %lu THP split\n", split, total); +} + +static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) +{ + return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || + is_vm_hugetlb_page(vma); +} + +static int split_huge_pages_pid(int pid, unsigned long vaddr_start, + unsigned long vaddr_end) +{ + int 
ret = 0; + struct task_struct *task; + struct mm_struct *mm; + unsigned long total = 0, split = 0; + unsigned long addr; + + vaddr_start &= PAGE_MASK; + vaddr_end &= PAGE_MASK; + + /* Find the task_struct from pid */ + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + ret = -ESRCH; + goto out; + } + get_task_struct(task); + rcu_read_unlock(); + + /* Find the mm_struct */ + mm = get_task_mm(task); + put_task_struct(task); + + if (!mm) { + ret = -EINVAL; + goto out; + } + + pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", + pid, vaddr_start, vaddr_end); + + mmap_read_lock(mm); + /* + * always increase addr by PAGE_SIZE, since we could have a PTE page + * table filled with PTE-mapped THPs, each of which is distinct. + */ + for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { + struct vm_area_struct *vma = vma_lookup(mm, addr); + struct page *page; + + if (!vma) + break; + + /* skip special VMA and hugetlb VMA */ + if (vma_not_suitable_for_thp_split(vma)) { + addr = vma->vm_end; + continue; + } + + /* FOLL_DUMP to ignore special (like zero) pages */ + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + + if (IS_ERR_OR_NULL(page)) + continue; + + if (!is_transparent_hugepage(page)) + goto next; + + total++; + if (!can_split_folio(page_folio(page), NULL)) + goto next; + + if (!trylock_page(page)) + goto next; + + if (!split_huge_page(page)) + split++; + + unlock_page(page); +next: + put_page(page); + cond_resched(); + } + mmap_read_unlock(mm); + mmput(mm); + + pr_debug("%lu of %lu THP split\n", split, total); + +out: + return ret; +} + +static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, + pgoff_t off_end) +{ + struct filename *file; + struct file *candidate; + struct address_space *mapping; + int ret = -EINVAL; + pgoff_t index; + int nr_pages = 1; + unsigned long total = 0, split = 0; + + file = getname_kernel(file_path); + if (IS_ERR(file)) + return ret; + + candidate = 
file_open_name(file, O_RDONLY, 0); + if (IS_ERR(candidate)) + goto out; + + pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", + file_path, off_start, off_end); + + mapping = candidate->f_mapping; + + for (index = off_start; index < off_end; index += nr_pages) { + struct page *fpage = pagecache_get_page(mapping, index, + FGP_ENTRY | FGP_HEAD, 0); + + nr_pages = 1; + if (xa_is_value(fpage) || !fpage) + continue; + + if (!is_transparent_hugepage(fpage)) + goto next; + + total++; + nr_pages = thp_nr_pages(fpage); + + if (!trylock_page(fpage)) + goto next; + + if (!split_huge_page(fpage)) + split++; + + unlock_page(fpage); +next: + put_page(fpage); + cond_resched(); + } + + filp_close(candidate, NULL); + ret = 0; + + pr_debug("%lu of %lu file-backed THP split\n", split, total); +out: + putname(file); + return ret; +} + +#define MAX_INPUT_BUF_SZ 255 + +/* + * Write handler for the split_huge_pages debugfs file. Accepted input: + * "/PATH,0xSTART,0xEND" splits THPs in the page cache of file /PATH, + * "PID,0xSTART,0xEND" splits THPs mapped in a task's address range, + * "1" alone splits every THP in the system. + * Returns the length of the parsed input on success, negative errno otherwise. + */ +static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppops) +{ + static DEFINE_MUTEX(split_debug_mutex); + ssize_t ret; + /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */ + char input_buf[MAX_INPUT_BUF_SZ]; + int pid; + unsigned long vaddr_start, vaddr_end; + + ret = mutex_lock_interruptible(&split_debug_mutex); + if (ret) + return ret; + + ret = -EFAULT; + + memset(input_buf, 0, MAX_INPUT_BUF_SZ); + if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) + goto out; + + input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; + + if (input_buf[0] == '/') { + char *tok; + char *buf = input_buf; + char file_path[MAX_INPUT_BUF_SZ]; + pgoff_t off_start = 0, off_end = 0; + size_t input_len = strlen(input_buf); + + tok = strsep(&buf, ","); + /* strsep() sets buf to NULL when no ',' follows the path */ + if (tok && buf) { + strcpy(file_path, tok); + } else { + ret = -EINVAL; + goto out; + } + + ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end); + if (ret != 2) { + ret = -EINVAL; + goto out; + } + ret = split_huge_pages_in_file(file_path, off_start, off_end); + if (!ret) + ret = 
input_len; + + goto out; + } + + ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); + if (ret == 1 && pid == 1) { + split_huge_pages_all(); + ret = strlen(input_buf); + goto out; + } else if (ret != 3) { + ret = -EINVAL; + goto out; + } + + ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end); + if (!ret) + ret = strlen(input_buf); +out: + mutex_unlock(&split_debug_mutex); + return ret; + +} + +static const struct file_operations split_huge_pages_fops = { + .owner = THIS_MODULE, + .write = split_huge_pages_write, + .llseek = no_llseek, +}; + +static int __init split_huge_pages_debugfs(void) +{ + debugfs_create_file("split_huge_pages", 0200, NULL, NULL, + &split_huge_pages_fops); + return 0; +} +late_initcall(split_huge_pages_debugfs); +#endif + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, + struct page *page) +{ + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long address = pvmw->address; + bool anon_exclusive; + pmd_t pmdval; + swp_entry_t entry; + pmd_t pmdswp; + + if (!(pvmw->pmd && !pvmw->pte)) + return 0; + + flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); + pmdval = pmdp_invalidate(vma, address, pvmw->pmd); + + /* See page_try_share_anon_rmap(): invalidate PMD first. 
*/ + anon_exclusive = PageAnon(page) && PageAnonExclusive(page); + if (anon_exclusive && page_try_share_anon_rmap(page)) { + set_pmd_at(mm, address, pvmw->pmd, pmdval); /* put the original PMD back before failing */ + return -EBUSY; + } + + if (pmd_dirty(pmdval)) + set_page_dirty(page); + if (pmd_write(pmdval)) + entry = make_writable_migration_entry(page_to_pfn(page)); + else if (anon_exclusive) + entry = make_readable_exclusive_migration_entry(page_to_pfn(page)); + else + entry = make_readable_migration_entry(page_to_pfn(page)); + if (pmd_young(pmdval)) + entry = make_migration_entry_young(entry); + if (pmd_dirty(pmdval)) + entry = make_migration_entry_dirty(entry); + pmdswp = swp_entry_to_pmd(entry); + if (pmd_soft_dirty(pmdval)) + pmdswp = pmd_swp_mksoft_dirty(pmdswp); + if (pmd_uffd_wp(pmdval)) + pmdswp = pmd_swp_mkuffd_wp(pmdswp); + set_pmd_at(mm, address, pvmw->pmd, pmdswp); + page_remove_rmap(page, vma, true); + put_page(page); + trace_set_migration_pmd(address, pmd_val(pmdswp)); + + return 0; +} + /* Install a huge PMD mapping of @new in place of the PMD migration entry found by @pvmw, carrying over the soft-dirty, uffd-wp, young, dirty and writable state recorded in the entry, and add the matching anon or file rmap. Does nothing when @pvmw is not at a PMD-level entry. */ +void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) +{ + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long address = pvmw->address; + unsigned long haddr = address & HPAGE_PMD_MASK; + pmd_t pmde; + swp_entry_t entry; + + if (!(pvmw->pmd && !pvmw->pte)) + return; + + entry = pmd_to_swp_entry(*pvmw->pmd); + get_page(new); + pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)); + if (pmd_swp_soft_dirty(*pvmw->pmd)) + pmde = pmd_mksoft_dirty(pmde); + if (pmd_swp_uffd_wp(*pvmw->pmd)) + pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde)); + if (!is_migration_entry_young(entry)) + pmde = pmd_mkold(pmde); + /* NOTE: this may contain setting soft-dirty on some archs */ + if (PageDirty(new) && is_migration_entry_dirty(entry)) + pmde = pmd_mkdirty(pmde); + if (is_writable_migration_entry(entry)) + pmde = maybe_pmd_mkwrite(pmde, vma); + else + pmde = pmd_wrprotect(pmde); + + if (PageAnon(new)) { + rmap_t rmap_flags = RMAP_COMPOUND; + + if 
(!is_readable_migration_entry(entry)) + rmap_flags |= RMAP_EXCLUSIVE; + + page_add_anon_rmap(new, vma, haddr, rmap_flags); + } else { + page_add_file_rmap(new, vma, true); + } + VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new)); + set_pmd_at(mm, haddr, pvmw->pmd, pmde); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache_pmd(vma, address, pvmw->pmd); + trace_remove_migration_pmd(address, pmd_val(pmde)); +} +#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c new file mode 100644 index 000000000..37288a7f0 --- /dev/null +++ b/mm/hugetlb.c @@ -0,0 +1,7698 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Generic hugetlb support. + * (C) Nadia Yvette Chambers, April 2004 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include "internal.h" +#include "hugetlb_vmemmap.h" + +int hugetlb_max_hstate __read_mostly; +unsigned int default_hstate_idx; +struct hstate hstates[HUGE_MAX_HSTATE]; + +#ifdef CONFIG_CMA +static struct cma *hugetlb_cma[MAX_NUMNODES]; +static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; +static bool hugetlb_cma_page(struct page *page, unsigned int order) +{ + return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page, + 1 << order); +} +#else +static bool hugetlb_cma_page(struct page *page, unsigned int order) +{ + return false; +} +#endif +static unsigned long hugetlb_cma_size __initdata; + +__initdata LIST_HEAD(huge_boot_pages); + +/* for command line parsing */ +static struct hstate * __initdata parsed_hstate; +static unsigned long __initdata default_hstate_max_huge_pages; +static bool __initdata parsed_valid_hugepagesz = true; +static 
bool __initdata parsed_default_hugepagesz; +static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; + +/* + * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, + * free_huge_pages, and surplus_huge_pages. + */ +DEFINE_SPINLOCK(hugetlb_lock); + +/* + * Serializes faults on the same logical page. This is used to + * prevent spurious OOMs when the hugepage pool is fully utilized. + */ +static int num_fault_mutexes; +struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; + +/* Forward declaration */ +static int hugetlb_acct_memory(struct hstate *h, long delta); +static void hugetlb_vma_lock_free(struct vm_area_struct *vma); +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); +static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); +static void hugetlb_unshare_pmds(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +static struct resv_map *vma_resv_map(struct vm_area_struct *vma); + /* True when the subpool can be released: its handle count is zero and, if a maximum is set, no pages are in use (used_hpages == 0); otherwise, if a minimum is set, the whole minimum is still reserved (rsv_hpages == min_hpages). */ +static inline bool subpool_is_free(struct hugepage_subpool *spool) +{ + if (spool->count) + return false; + if (spool->max_hpages != -1) + return spool->used_hpages == 0; + if (spool->min_hpages != -1) + return spool->rsv_hpages == spool->min_hpages; + + return true; +} + /* Drop spool->lock (restoring @irq_flags) and free the subpool if subpool_is_free() then holds. */ +static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, + unsigned long irq_flags) +{ + spin_unlock_irqrestore(&spool->lock, irq_flags); + + /* If no pages are used, and no other handles to the subpool + * remain, give up any reservations based on minimum size and + * free the subpool */ + if (subpool_is_free(spool)) { + if (spool->min_hpages != -1) + hugetlb_acct_memory(spool->hstate, + -spool->min_hpages); + kfree(spool); + } +} + /* Allocate a subpool for @h holding one handle (count == 1). When @min_hpages is not -1 that many pages are charged through hugetlb_acct_memory() up front. Returns NULL if the allocation or the charge fails. */ +struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, + long min_hpages) +{ + struct hugepage_subpool *spool; + + spool = kzalloc(sizeof(*spool), GFP_KERNEL); + if (!spool) + return NULL; + + spin_lock_init(&spool->lock); + spool->count = 
1; + spool->max_hpages = max_hpages; + spool->hstate = h; + spool->min_hpages = min_hpages; + + if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { + kfree(spool); + return NULL; + } + spool->rsv_hpages = min_hpages; + + return spool; +} + /* Drop one handle on @spool; unlock_or_release_subpool() frees it once subpool_is_free() holds. */ +void hugepage_put_subpool(struct hugepage_subpool *spool) +{ + unsigned long flags; + + spin_lock_irqsave(&spool->lock, flags); + BUG_ON(!spool->count); + spool->count--; + unlock_or_release_subpool(spool, flags); +} + +/* + * Subpool accounting for allocating and reserving pages. + * Return -ENOMEM if there are not enough resources to satisfy the + * request. Otherwise, return the number of pages by which the + * global pools must be adjusted (upward). The returned value may + * only be different than the passed value (delta) in the case where + * a subpool minimum size must be maintained. + */ +static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, + long delta) +{ + long ret = delta; + + if (!spool) /* no subpool: the whole delta goes against the global pool */ + return ret; + + spin_lock_irq(&spool->lock); + + if (spool->max_hpages != -1) { /* maximum size accounting */ + if ((spool->used_hpages + delta) <= spool->max_hpages) + spool->used_hpages += delta; + else { + ret = -ENOMEM; + goto unlock_ret; + } + } + + /* minimum size accounting */ + if (spool->min_hpages != -1 && spool->rsv_hpages) { + if (delta > spool->rsv_hpages) { + /* + * Asking for more reserves than those already taken on + * behalf of subpool. Return difference. + */ + ret = delta - spool->rsv_hpages; + spool->rsv_hpages = 0; + } else { + ret = 0; /* reserves already accounted for */ + spool->rsv_hpages -= delta; + } + } + +unlock_ret: + spin_unlock_irq(&spool->lock); + return ret; +} + +/* + * Subpool accounting for freeing and unreserving pages. + * Return the number of global page reservations that must be dropped. + * The return value may only be different than the passed value (delta) + * in the case where a subpool minimum size must be maintained. 
+ */ +static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, + long delta) +{ + long ret = delta; + unsigned long flags; + + if (!spool) + return delta; + + spin_lock_irqsave(&spool->lock, flags); + + if (spool->max_hpages != -1) /* maximum size accounting */ + spool->used_hpages -= delta; + + /* minimum size accounting */ + if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { + if (spool->rsv_hpages + delta <= spool->min_hpages) + ret = 0; + else + ret = spool->rsv_hpages + delta - spool->min_hpages; + + spool->rsv_hpages += delta; + if (spool->rsv_hpages > spool->min_hpages) /* never hold more reserve than the minimum */ + spool->rsv_hpages = spool->min_hpages; + } + + /* + * If hugetlbfs_put_super couldn't free spool due to an outstanding + * quota reference, free it now. + */ + unlock_or_release_subpool(spool, flags); + + return ret; +} + /* Subpool recorded in the hugetlbfs superblock info of @inode. */ +static inline struct hugepage_subpool *subpool_inode(struct inode *inode) +{ + return HUGETLBFS_SB(inode->i_sb)->spool; +} + /* Subpool of the file backing @vma. */ +static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) +{ + return subpool_inode(file_inode(vma->vm_file)); +} + +/* + * hugetlb vma_lock helper routines + */ /* __vma_shareable_lock(): true when the vma is VM_MAYSHARE or VM_SHARED and has vm_private_data set, which the helpers below treat as a struct hugetlb_vma_lock. */ +static bool __vma_shareable_lock(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) && + vma->vm_private_data; +} + /* Take the vma's hugetlb lock for reading: the hugetlb_vma_lock semaphore when __vma_shareable_lock() holds, else the resv_map semaphore when __vma_private_lock() holds; otherwise nothing is locked. */ +void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_read(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + down_read(&resv_map->rw_sema); + } +} + /* Release the read lock taken by hugetlb_vma_lock_read(), using the same lock selection. */ +void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_read(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + up_read(&resv_map->rw_sema); + } +} + +void hugetlb_vma_lock_write(struct vm_area_struct 
*vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_write(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + down_write(&resv_map->rw_sema); + } +} + /* Release the write lock taken on the vma's hugetlb lock, using the same lock selection as the lock routines. */ +void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_write(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + up_write(&resv_map->rw_sema); + } +} + /* Try to take the vma's hugetlb lock for writing; returns the down_write_trylock() result, or 1 when the vma has neither kind of lock. */ +int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + return down_write_trylock(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + return down_write_trylock(&resv_map->rw_sema); + } + + return 1; +} + /* lockdep assertion that the vma's hugetlb lock (whichever kind applies) is held. */ +void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + lockdep_assert_held(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + lockdep_assert_held(&resv_map->rw_sema); + } +} + /* kref release callback: frees the hugetlb_vma_lock that embeds @kref. */ +void hugetlb_vma_lock_release(struct kref *kref) +{ + struct hugetlb_vma_lock *vma_lock = container_of(kref, + struct hugetlb_vma_lock, refs); + + kfree(vma_lock); +} + /* Detach a write-held @vma_lock from its vma, release the semaphore and drop one reference. */ +static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) +{ + struct vm_area_struct *vma = vma_lock->vma; + + /* + * vma_lock structure may or not be released as a result of put, + * it certainly will no longer be attached to vma so clear pointer. + * Semaphore synchronizes access to vma_lock->vma field. 
+ */ + vma_lock->vma = NULL; + vma->vm_private_data = NULL; + up_write(&vma_lock->rw_sema); + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); +} + /* Release a write-held vma lock. For sharable vmas the lock structure is also detached from the vma and its reference dropped; for private vmas only the resv_map semaphore is released. */ +static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + __hugetlb_vma_unlock_write_put(vma_lock); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + /* no free for anon vmas, but still need to unlock */ + up_write(&resv_map->rw_sema); + } +} + /* Detach and drop the per-vma lock of a sharable vma, taking the semaphore for writing first; no-op for a NULL vma or one without such a lock. */ +static void hugetlb_vma_lock_free(struct vm_area_struct *vma) +{ + /* + * Only present in sharable vmas. + */ + if (!vma || !__vma_shareable_lock(vma)) + return; + + if (vma->vm_private_data) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_write(&vma_lock->rw_sema); + __hugetlb_vma_unlock_write_put(vma_lock); + } +} + /* Allocate a hugetlb_vma_lock for a VM_MAYSHARE vma and store it in vm_private_data; returns silently (after a one-time warning) if the allocation fails. */ +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + struct hugetlb_vma_lock *vma_lock; + + /* Only establish in (flags) sharable vmas */ + if (!vma || !(vma->vm_flags & VM_MAYSHARE)) + return; + + /* Should never get here with non-NULL vm_private_data */ + if (vma->vm_private_data) + return; + + vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); + if (!vma_lock) { + /* + * If we can not allocate structure, then vma can not + * participate in pmd sharing. This is only a possible + * performance enhancement and memory saving issue. + * However, the lock is also used to synchronize page + * faults with truncation. If the lock is not present, + * unlikely races could leave pages in a file past i_size + * until the file is removed. Warn in the unlikely case of + * allocation failure. 
+ */ + pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); + return; + } + + kref_init(&vma_lock->refs); + init_rwsem(&vma_lock->rw_sema); + vma_lock->vma = vma; + vma->vm_private_data = vma_lock; +} + +/* Helper that removes a struct file_region from the resv_map cache and returns + * it for use. The entry's range is set to [from, to); the cache must not be empty. + */ +static struct file_region * +get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) +{ + struct file_region *nrg; + + VM_BUG_ON(resv->region_cache_count <= 0); + + resv->region_cache_count--; + nrg = list_first_entry(&resv->region_cache, struct file_region, link); + list_del(&nrg->link); + + nrg->from = from; + nrg->to = to; + + return nrg; +} + /* Copy the cgroup uncharge info (reservation counter and css) from @rg to @nrg, taking an extra css reference when a css is set; compiles to nothing without CONFIG_CGROUP_HUGETLB. */ +static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg, + struct file_region *rg) +{ +#ifdef CONFIG_CGROUP_HUGETLB + nrg->reservation_counter = rg->reservation_counter; + nrg->css = rg->css; + if (rg->css) + css_get(rg->css); +#endif +} + +/* Helper that records hugetlb_cgroup uncharge info. */ +static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg, + struct hstate *h, + struct resv_map *resv, + struct file_region *nrg) +{ +#ifdef CONFIG_CGROUP_HUGETLB + if (h_cg) { + nrg->reservation_counter = + &h_cg->rsvd_hugepage[hstate_index(h)]; + nrg->css = &h_cg->css; + /* + * The caller will hold exactly one h_cg->css reference for the + * whole contiguous reservation region. But this area might be + * scattered when there are already some file_regions reside in + * it. As a result, many file_regions may share only one css + * reference. In order to ensure that one file_region must hold + * exactly one h_cg->css reference, we should do css_get for + * each file_region and leave the reference held by caller + * untouched. + */ + css_get(&h_cg->css); + if (!resv->pages_per_hpage) + resv->pages_per_hpage = pages_per_huge_page(h); + /* pages_per_hpage should be the same for all entries in + * a resv_map. 
+ */ + VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h)); + } else { + nrg->reservation_counter = NULL; + nrg->css = NULL; + } +#endif +} + /* Drop the css reference held by @rg, if any; compiles to nothing without CONFIG_CGROUP_HUGETLB. */ +static void put_uncharge_info(struct file_region *rg) +{ +#ifdef CONFIG_CGROUP_HUGETLB + if (rg->css) + css_put(rg->css); +#endif +} + /* True when both regions carry the same reservation counter and css (always true without CONFIG_CGROUP_HUGETLB). */ +static bool has_same_uncharge_info(struct file_region *rg, + struct file_region *org) +{ +#ifdef CONFIG_CGROUP_HUGETLB + return rg->reservation_counter == org->reservation_counter && + rg->css == org->css; + +#else + return true; +#endif +} + /* Merge @rg with its list predecessor and/or successor when the ranges are contiguous and the uncharge info matches. The absorbed entry is unlinked, its css reference dropped, and freed. */ +static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) +{ + struct file_region *nrg, *prg; + + prg = list_prev_entry(rg, link); + if (&prg->link != &resv->regions && prg->to == rg->from && + has_same_uncharge_info(prg, rg)) { + prg->to = rg->to; + + list_del(&rg->link); + put_uncharge_info(rg); + kfree(rg); + + rg = prg; /* continue with the merged entry when checking the successor */ + } + + nrg = list_next_entry(rg, link); + if (&nrg->link != &resv->regions && nrg->from == rg->to && + has_same_uncharge_info(nrg, rg)) { + nrg->from = rg->from; + + list_del(&rg->link); + put_uncharge_info(rg); + kfree(rg); + } +} + /* hugetlb_resv_map_add(): with regions_needed == NULL, take an entry from the cache for [from, to), record its uncharge info, insert it after @rg and coalesce; otherwise only bump *regions_needed. Returns to - from either way. */ +static inline long +hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from, + long to, struct hstate *h, struct hugetlb_cgroup *cg, + long *regions_needed) +{ + struct file_region *nrg; + + if (!regions_needed) { + nrg = get_file_region_entry_from_cache(map, from, to); + record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); + list_add(&nrg->link, rg); + coalesce_file_region(map, nrg); + } else + *regions_needed += 1; + + return to - from; +} + +/* + * Must be called with resv->lock held. + * + * Calling this with regions_needed != NULL will count the number of pages + * to be added but will not modify the linked list. And regions_needed will + * indicate the number of file_regions needed in the cache to carry out to add + * the regions for this range. 
+ */ +static long add_reservation_in_range(struct resv_map *resv, long f, long t, + struct hugetlb_cgroup *h_cg, + struct hstate *h, long *regions_needed) +{ + long add = 0; /* pages in [f, t) not yet covered by an existing region; this is the return value */ + struct list_head *head = &resv->regions; + long last_accounted_offset = f; + struct file_region *iter, *trg = NULL; + struct list_head *rg = NULL; + + if (regions_needed) + *regions_needed = 0; + + /* In this loop, we essentially handle an entry for the range + * [last_accounted_offset, iter->from), at every iteration, with some + * bounds checking. + */ + list_for_each_entry_safe(iter, trg, head, link) { + /* Skip irrelevant regions that start before our range. */ + if (iter->from < f) { + /* If this region ends after the last accounted offset, + * then we need to update last_accounted_offset. + */ + if (iter->to > last_accounted_offset) + last_accounted_offset = iter->to; + continue; + } + + /* When we find a region that starts beyond our range, we've + * finished. + */ + if (iter->from >= t) { + rg = iter->link.prev; /* insertion point for the final gap: just before this region */ + break; + } + + /* Add an entry for last_accounted_offset -> iter->from, and + * update last_accounted_offset. + */ + if (iter->from > last_accounted_offset) + add += hugetlb_resv_map_add(resv, iter->link.prev, + last_accounted_offset, + iter->from, h, h_cg, + regions_needed); + + last_accounted_offset = iter->to; + } + + /* Handle the case where our range extends beyond + * last_accounted_offset. + */ + if (!rg) /* no region starts at or beyond t: insert after the last list entry */ + rg = head->prev; + if (last_accounted_offset < t) + add += hugetlb_resv_map_add(resv, rg, last_accounted_offset, + t, h, h_cg, regions_needed); + + return add; +} + +/* Must be called with resv->lock acquired. Will drop lock to allocate entries. 
+ */ +static int allocate_file_region_entries(struct resv_map *resv, + int regions_needed) + __must_hold(&resv->lock) +{ + LIST_HEAD(allocated_regions); + int to_allocate = 0, i = 0; + struct file_region *trg = NULL, *rg = NULL; + + VM_BUG_ON(regions_needed < 0); + + /* + * Check for sufficient descriptors in the cache to accommodate + * the number of in progress add operations plus regions_needed. + * + * This is a while loop because when we drop the lock, some other call + * to region_add or region_del may have consumed some region_entries, + * so we keep looping here until we finally have enough entries for + * (adds_in_progress + regions_needed). + */ + while (resv->region_cache_count < + (resv->adds_in_progress + regions_needed)) { + to_allocate = resv->adds_in_progress + regions_needed - + resv->region_cache_count; + + /* At this point, we should have enough entries in the cache + * for all the existing adds_in_progress. We should only be + * needing to allocate for regions_needed. + */ + VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); + + spin_unlock(&resv->lock); /* GFP_KERNEL allocations below run without the spinlock held */ + for (i = 0; i < to_allocate; i++) { + trg = kmalloc(sizeof(*trg), GFP_KERNEL); + if (!trg) + goto out_of_memory; + list_add(&trg->link, &allocated_regions); + } + + spin_lock(&resv->lock); + + list_splice(&allocated_regions, &resv->region_cache); + resv->region_cache_count += to_allocate; + } + + return 0; /* success: resv->lock is held, as on entry */ + +out_of_memory: /* NOTE: resv->lock is NOT re-taken on this path; region_add() and region_chg() return -ENOMEM without unlocking */ + list_for_each_entry_safe(rg, trg, &allocated_regions, link) { + list_del(&rg->link); + kfree(rg); + } + return -ENOMEM; +} + +/* + * Add the huge page range represented by [f, t) to the reserve + * map. Regions will be taken from the cache to fill in this range. + * Sufficient regions should exist in the cache due to the previous + * call to region_chg with the same range, but in some cases the cache will not + * have sufficient entries due to races with other code doing region_add or + * region_del. The extra needed entries will be allocated. 
+ * + * regions_needed is the out value provided by a previous call to region_chg. + * + * Return the number of new huge pages added to the map. This number is greater + * than or equal to zero. If file_region entries needed to be allocated for + * this operation and we were not able to allocate, it returns -ENOMEM. + * region_add of regions of length 1 never allocate file_regions and cannot + * fail; region_chg will always allocate at least 1 entry and a region_add for + * 1 page will only require at most 1 entry. + */ +static long region_add(struct resv_map *resv, long f, long t, + long in_regions_needed, struct hstate *h, + struct hugetlb_cgroup *h_cg) +{ + long add = 0, actual_regions_needed = 0; + + spin_lock(&resv->lock); /* may be dropped and re-taken inside allocate_file_region_entries() */ +retry: + + /* Count how many regions are actually needed to execute this add. */ + add_reservation_in_range(resv, f, t, NULL, NULL, + &actual_regions_needed); + + /* + * Check for sufficient descriptors in the cache to accommodate + * this add operation. Note that actual_regions_needed may be greater + * than in_regions_needed, as the resv_map may have been modified since + * the region_chg call. In this case, we need to make sure that we + * allocate extra entries, such that we have enough for all the + * existing adds_in_progress, plus the excess needed for this + * operation. + */ + if (actual_regions_needed > in_regions_needed && + resv->region_cache_count < + resv->adds_in_progress + + (actual_regions_needed - in_regions_needed)) { + /* region_add operation of range 1 should never need to + * allocate file_region entries. 
+ */ + VM_BUG_ON(t - f <= 1); + + if (allocate_file_region_entries( + resv, actual_regions_needed - in_regions_needed)) { + return -ENOMEM; /* resv->lock was already dropped by the failed allocation */ + } + + goto retry; /* the lock was dropped meanwhile, so recount the regions needed */ + } + + add = add_reservation_in_range(resv, f, t, h_cg, h, NULL); + + resv->adds_in_progress -= in_regions_needed; + + spin_unlock(&resv->lock); + return add; +} + +/* + * Examine the existing reserve map and determine how many + * huge pages in the specified range [f, t) are NOT currently + * represented. This routine is called before a subsequent + * call to region_add that will actually modify the reserve + * map to add the specified range [f, t). region_chg does + * not change the number of huge pages represented by the + * map. A number of new file_region structures is added to the cache as a + * placeholder, for the subsequent region_add call to use. At least 1 + * file_region structure is added. + * + * out_regions_needed is the number of regions added to the + * resv->adds_in_progress. This value needs to be provided to a follow up call + * to region_add or region_abort for proper accounting. + * + * Returns the number of huge pages that need to be added to the existing + * reservation map for the range [f, t). This number is greater or equal to + * zero. -ENOMEM is returned if a new file_region structure or cache entry + * is needed and can not be allocated. + */ +static long region_chg(struct resv_map *resv, long f, long t, + long *out_regions_needed) +{ + long chg = 0; + + spin_lock(&resv->lock); + + /* Count how many hugepages in this range are NOT represented. */ + chg = add_reservation_in_range(resv, f, t, NULL, NULL, + out_regions_needed); + + if (*out_regions_needed == 0) /* always reserve at least one cache entry */ + *out_regions_needed = 1; + + if (allocate_file_region_entries(resv, *out_regions_needed)) + return -ENOMEM; /* resv->lock was already dropped by the failed allocation */ + + resv->adds_in_progress += *out_regions_needed; + + spin_unlock(&resv->lock); + return chg; +} + +/* + * Abort the in progress add operation. 
The adds_in_progress field + * of the resv_map keeps track of the operations in progress between + * calls to region_chg and region_add. Operations are sometimes + * aborted after the call to region_chg. In such cases, region_abort + * is called to decrement the adds_in_progress counter. regions_needed + * is the value returned by the region_chg call, it is used to decrement + * the adds_in_progress counter. + * + * NOTE: The range arguments [f, t) are not needed or used in this + * routine. They are kept to make reading the calling code easier as + * arguments will match the associated region_chg call. + */ +static void region_abort(struct resv_map *resv, long f, long t, + long regions_needed) +{ + spin_lock(&resv->lock); + VM_BUG_ON(!resv->region_cache_count); + resv->adds_in_progress -= regions_needed; /* the cache entries themselves stay in the cache */ + spin_unlock(&resv->lock); +} + +/* + * Delete the specified range [f, t) from the reserve map. If the + * t parameter is LONG_MAX, this indicates that ALL regions after f + * should be deleted. Locate the regions which intersect [f, t) + * and either trim, delete or split the existing regions. + * + * Returns the number of huge pages deleted from the reserve map. + * In the normal case, the return value is zero or more. In the + * case where a region must be split, a new region descriptor must + * be allocated. If the allocation fails, -ENOMEM will be returned. + * NOTE: If the parameter t == LONG_MAX, then we will never split + * a region and possibly return -ENOMEM. Callers specifying + * t == LONG_MAX do not need to check for -ENOMEM error. + */ +static long region_del(struct resv_map *resv, long f, long t) +{ + struct list_head *head = &resv->regions; + struct file_region *rg, *trg; + struct file_region *nrg = NULL; /* spare descriptor for a split; kept across retries */ + long del = 0; + +retry: /* re-scan from the list head after the lock was dropped to allocate nrg */ + spin_lock(&resv->lock); + list_for_each_entry_safe(rg, trg, head, link) { + /* + * Skip regions before the range to be deleted. file_region + * ranges are normally of the form [from, to). 
However, there + * may be a "placeholder" entry in the map which is of the form + * (from, to) with from == to. Check for placeholder entries + * at the beginning of the range to be deleted. + */ + if (rg->to <= f && (rg->to != rg->from || rg->to != f)) + continue; + + if (rg->from >= t) + break; + + if (f > rg->from && t < rg->to) { /* Must split region */ + /* + * Check for an entry in the cache before dropping + * lock and attempting allocation. + */ + if (!nrg && + resv->region_cache_count > resv->adds_in_progress) { + nrg = list_first_entry(&resv->region_cache, + struct file_region, + link); + list_del(&nrg->link); + resv->region_cache_count--; + } + + if (!nrg) { + spin_unlock(&resv->lock); + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + goto retry; + } + + del += t - f; + hugetlb_cgroup_uncharge_file_region( + resv, rg, t - f, false); + + /* New entry for end of split region */ + nrg->from = t; + nrg->to = rg->to; + + copy_hugetlb_cgroup_uncharge_info(nrg, rg); + + INIT_LIST_HEAD(&nrg->link); + + /* Original entry is trimmed */ + rg->to = f; + + list_add(&nrg->link, &rg->link); + nrg = NULL; /* now owned by the list; must not be freed below */ + break; + } + + if (f <= rg->from && t >= rg->to) { /* Remove entire region */ + del += rg->to - rg->from; + hugetlb_cgroup_uncharge_file_region(resv, rg, + rg->to - rg->from, true); + list_del(&rg->link); + kfree(rg); + continue; + } + + if (f <= rg->from) { /* Trim beginning of region */ + hugetlb_cgroup_uncharge_file_region(resv, rg, + t - rg->from, false); + + del += t - rg->from; + rg->from = t; + } else { /* Trim end of region */ + hugetlb_cgroup_uncharge_file_region(resv, rg, + rg->to - f, false); + + del += rg->to - f; + rg->to = f; + } + } + + spin_unlock(&resv->lock); + kfree(nrg); /* frees a spare descriptor that was obtained but not used; NULL otherwise */ + return del; +} + +/* + * A rare out of memory error was encountered which prevented removal of + * the reserve map region for a page. The huge page itself was free'ed + * and removed from the page cache. 
This routine will adjust the subpool + * usage count, and the global reserve count if needed. By incrementing + * these counts, the reserve map entry which could not be deleted will + * appear as a "reserved" entry instead of simply dangling with incorrect + * counts. + */ +void hugetlb_fix_reserve_counts(struct inode *inode) +{ + struct hugepage_subpool *spool = subpool_inode(inode); + long rsv_adjust; + bool reserved = false; + + rsv_adjust = hugepage_subpool_get_pages(spool, 1); + if (rsv_adjust > 0) { /* the global pool must be charged as well */ + struct hstate *h = hstate_inode(inode); + + if (!hugetlb_acct_memory(h, 1)) + reserved = true; + } else if (!rsv_adjust) { /* covered by the subpool's own reserve; no global adjustment */ + reserved = true; + } + + if (!reserved) /* subpool returned an error or the global charge failed */ + pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); +} + +/* + * Count and return the number of huge pages in the reserve map + * that intersect with the range [f, t). + */ +static long region_count(struct resv_map *resv, long f, long t) +{ + struct list_head *head = &resv->regions; + struct file_region *rg; + long chg = 0; + + spin_lock(&resv->lock); + /* Locate each segment we overlap with, and count that overlap. */ + list_for_each_entry(rg, head, link) { + long seg_from; + long seg_to; + + if (rg->to <= f) + continue; + if (rg->from >= t) + break; + + seg_from = max(rg->from, f); + seg_to = min(rg->to, t); + + chg += seg_to - seg_from; + } + spin_unlock(&resv->lock); + + return chg; +} + +/* + * Convert the address within this vma to the page offset within + * the mapping, in pagecache page units; huge pages here. 
+ */ +static pgoff_t vma_hugecache_offset(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + return ((address - vma->vm_start) >> huge_page_shift(h)) + /* huge pages from the start of the vma */ + (vma->vm_pgoff >> huge_page_order(h)); /* plus the vma's file offset converted to huge pages */ +} + /* Exported wrapper: vma_hugecache_offset() using the hstate of @vma itself. */ +pgoff_t linear_hugepage_index(struct vm_area_struct *vma, + unsigned long address) +{ + return vma_hugecache_offset(hstate_vma(vma), vma, address); +} +EXPORT_SYMBOL_GPL(linear_hugepage_index); + +/* + * Return the size of the pages allocated when backing a VMA. In the majority + * cases this will be same size as used by the page table entries. + * Uses the vma's vm_ops->pagesize() hook when present, PAGE_SIZE otherwise. + */ +unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + if (vma->vm_ops && vma->vm_ops->pagesize) + return vma->vm_ops->pagesize(vma); + return PAGE_SIZE; +} +EXPORT_SYMBOL_GPL(vma_kernel_pagesize); + +/* + * Return the page size being used by the MMU to back a VMA. In the majority + * of cases, the page size used by the kernel matches the MMU size. On + * architectures where it differs, an architecture-specific 'strong' + * version of this symbol is required. + */ +__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + return vma_kernel_pagesize(vma); +} + +/* + * Flags for MAP_PRIVATE reservations. These are stored in the bottom + * bits of the reservation map pointer, which are always clear due to + * alignment. + */ +#define HPAGE_RESV_OWNER (1UL << 0) +#define HPAGE_RESV_UNMAPPED (1UL << 1) +#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) + +/* + * These helpers are used to track how many pages are reserved for + * faults in a MAP_PRIVATE mapping. Only the process that called mmap() + * is guaranteed to have their future faults succeed. + * + * With the exception of hugetlb_dup_vma_private() which is called at fork(), + * the reserve counters are updated with the hugetlb_lock held. It is safe + * to reset the VMA at fork() time as it is not in use yet and there is no + * chance of the global counters getting corrupted as a result of the values. 
+ * + * The private mapping reservation is represented in a subtly different + * manner to a shared mapping. A shared mapping has a region map associated + * with the underlying file, this region map represents the backing file + * pages which have ever had a reservation assigned which this persists even + * after the page is instantiated. A private mapping has a region map + * associated with the original mmap which is attached to all VMAs which + * reference it, this region map represents those offsets which have consumed + * reservation ie. where pages have been instantiated. + */ +static unsigned long get_vma_private_data(struct vm_area_struct *vma) +{ + return (unsigned long)vma->vm_private_data; +} + /* Store @value (a pointer value, possibly with HPAGE_RESV_* flag bits) in vma->vm_private_data. */ +static void set_vma_private_data(struct vm_area_struct *vma, + unsigned long value) +{ + vma->vm_private_data = (void *)value; +} + /* Record in @resv_map the cgroup reservation counter, pages per huge page and css taken from @h_cg and @h, or clear all three when either is NULL; compiles to nothing without CONFIG_CGROUP_HUGETLB. */ +static void +resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map, + struct hugetlb_cgroup *h_cg, + struct hstate *h) +{ +#ifdef CONFIG_CGROUP_HUGETLB + if (!h_cg || !h) { + resv_map->reservation_counter = NULL; + resv_map->pages_per_hpage = 0; + resv_map->css = NULL; + } else { + resv_map->reservation_counter = + &h_cg->rsvd_hugepage[hstate_index(h)]; + resv_map->pages_per_hpage = pages_per_huge_page(h); + resv_map->css = &h_cg->css; + } +#endif +} + /* Allocate a reservation map holding one reference, an empty region list and one preallocated file_region in its cache; returns NULL if either allocation fails. */ +struct resv_map *resv_map_alloc(void) +{ + struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); + struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); + + if (!resv_map || !rg) { + kfree(resv_map); + kfree(rg); + return NULL; + } + + kref_init(&resv_map->refs); + spin_lock_init(&resv_map->lock); + INIT_LIST_HEAD(&resv_map->regions); + init_rwsem(&resv_map->rw_sema); + + resv_map->adds_in_progress = 0; + /* + * Initialize these to 0. On shared mappings, 0's here indicate these + * fields don't do cgroup accounting. 
On private mappings, these will be + * re-initialized to the proper values, to indicate that hugetlb cgroup + * reservations are to be un-charged from here. + */ + resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL); + + INIT_LIST_HEAD(&resv_map->region_cache); + list_add(&rg->link, &resv_map->region_cache); + resv_map->region_cache_count = 1; + + return resv_map; +} + +void resv_map_release(struct kref *ref) +{ + struct resv_map *resv_map = container_of(ref, struct resv_map, refs); + struct list_head *head = &resv_map->region_cache; + struct file_region *rg, *trg; + + /* Clear out any active regions before we release the map. */ + region_del(resv_map, 0, LONG_MAX); + + /* ... and any entries left in the cache */ + list_for_each_entry_safe(rg, trg, head, link) { + list_del(&rg->link); + kfree(rg); + } + + VM_BUG_ON(resv_map->adds_in_progress); + + kfree(resv_map); +} + +static inline struct resv_map *inode_resv_map(struct inode *inode) +{ + /* + * At inode evict time, i_mapping may not point to the original + * address space within the inode. This original address space + * contains the pointer to the resv_map. So, always use the + * address space embedded within the inode. + * The VERY common case is inode->mapping == &inode->i_data but, + * this may not be true for device special inodes. 
+ */ + return (struct resv_map *)(&inode->i_data)->private_data; +} + +static struct resv_map *vma_resv_map(struct vm_area_struct *vma) +{ + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + if (vma->vm_flags & VM_MAYSHARE) { + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + return inode_resv_map(inode); + + } else { + return (struct resv_map *)(get_vma_private_data(vma) & + ~HPAGE_RESV_MASK); + } +} + +static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) +{ + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); + + set_vma_private_data(vma, (unsigned long)map); +} + +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +{ + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); + + set_vma_private_data(vma, get_vma_private_data(vma) | flags); +} + +static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) +{ + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + + return (get_vma_private_data(vma) & flag) != 0; +} + +bool __vma_private_lock(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & VM_MAYSHARE) && + get_vma_private_data(vma) & ~HPAGE_RESV_MASK && + is_vma_resv_set(vma, HPAGE_RESV_OWNER); +} + +void hugetlb_dup_vma_private(struct vm_area_struct *vma) +{ + VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + /* + * Clear vm_private_data + * - For shared mappings this is a per-vma semaphore that may be + * allocated in a subsequent call to hugetlb_vm_op_open. + * Before clearing, make sure pointer is not associated with vma + * as this will leak the structure. This is the case when called + * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already + * been called to allocate a new structure. + * - For MAP_PRIVATE mappings, this is the reserve map which does + * not apply to children. 
Faults generated by the children are + * not guaranteed to succeed, even if read-only. + */ + if (vma->vm_flags & VM_MAYSHARE) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (vma_lock && vma_lock->vma != vma) + vma->vm_private_data = NULL; + } else + vma->vm_private_data = NULL; +} + +/* + * Reset and decrement one ref on hugepage private reservation. + * Called with mm->mmap_sem writer semaphore held. + * This function should be only used by move_vma() and operate on + * same sized vma. It should never come here with last ref on the + * reservation. + */ +void clear_vma_resv_huge_pages(struct vm_area_struct *vma) +{ + /* + * Clear the old hugetlb private page reservation. + * It has already been transferred to new_vma. + * + * During a mremap() operation of a hugetlb vma we call move_vma() + * which copies vma into new_vma and unmaps vma. After the copy + * operation both new_vma and vma share a reference to the resv_map + * struct, and at that point vma is about to be unmapped. We don't + * want to return the reservation to the pool at unmap of vma because + * the reservation still lives on in new_vma, so simply decrement the + * ref here and remove the resv_map reference from this vma. + */ + struct resv_map *reservations = vma_resv_map(vma); + + if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + resv_map_put_hugetlb_cgroup_uncharge_info(reservations); + kref_put(&reservations->refs, resv_map_release); + } + + hugetlb_dup_vma_private(vma); +} + +/* Returns true if the VMA has associated reserve pages */ +static bool vma_has_reserves(struct vm_area_struct *vma, long chg) +{ + if (vma->vm_flags & VM_NORESERVE) { + /* + * This address is already reserved by other process(chg == 0), + * so, we should decrement reserved count. Without decrementing, + * reserve count remains after releasing inode, because this + * allocated page will go into page cache and is regarded as + * coming from reserved pool in releasing step. 
Currently, we + * don't have any other solution to deal with this situation + * properly, so add work-around here. + */ + if (vma->vm_flags & VM_MAYSHARE && chg == 0) + return true; + else + return false; + } + + /* Shared mappings always use reserves */ + if (vma->vm_flags & VM_MAYSHARE) { + /* + * We know VM_NORESERVE is not set. Therefore, there SHOULD + * be a region map for all pages. The only situation where + * there is no region map is if a hole was punched via + * fallocate. In this case, there really are no reserves to + * use. This situation is indicated if chg != 0. + */ + if (chg) + return false; + else + return true; + } + + /* + * Only the process that called mmap() has reserves for + * private mappings. + */ + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + /* + * Like the shared case above, a hole punch or truncate + * could have been performed on the private mapping. + * Examine the value of chg to determine if reserves + * actually exist or were previously consumed. + * Very Subtle - The value of chg comes from a previous + * call to vma_needs_reserves(). The reserve map for + * private mappings has different (opposite) semantics + * than that of shared mappings. vma_needs_reserves() + * has already taken this difference in semantics into + * account. Therefore, the meaning of chg is the same + * as in the shared case above. Code could easily be + * combined, but keeping it separate draws attention to + * subtle differences. 
+ */ + if (chg) + return false; + else + return true; + } + + return false; +} + +static void enqueue_huge_page(struct hstate *h, struct page *page) +{ + int nid = page_to_nid(page); + + lockdep_assert_held(&hugetlb_lock); + VM_BUG_ON_PAGE(page_count(page), page); + + list_move(&page->lru, &h->hugepage_freelists[nid]); + h->free_huge_pages++; + h->free_huge_pages_node[nid]++; + SetHPageFreed(page); +} + +static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) +{ + struct page *page; + bool pin = !!(current->flags & PF_MEMALLOC_PIN); + + lockdep_assert_held(&hugetlb_lock); + list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { + if (pin && !is_longterm_pinnable_page(page)) + continue; + + if (PageHWPoison(page)) + continue; + + list_move(&page->lru, &h->hugepage_activelist); + set_page_refcounted(page); + ClearHPageFreed(page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + return page; + } + + return NULL; +} + +static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, + nodemask_t *nmask) +{ + unsigned int cpuset_mems_cookie; + struct zonelist *zonelist; + struct zone *zone; + struct zoneref *z; + int node = NUMA_NO_NODE; + + zonelist = node_zonelist(nid, gfp_mask); + +retry_cpuset: + cpuset_mems_cookie = read_mems_allowed_begin(); + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { + struct page *page; + + if (!cpuset_zone_allowed(zone, gfp_mask)) + continue; + /* + * no need to ask again on the same node. 
Pool is node rather than + * zone aware + */ + if (zone_to_nid(zone) == node) + continue; + node = zone_to_nid(zone); + + page = dequeue_huge_page_node_exact(h, node); + if (page) + return page; + } + if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + + return NULL; +} + +static unsigned long available_huge_pages(struct hstate *h) +{ + return h->free_huge_pages - h->resv_huge_pages; +} + +static struct page *dequeue_huge_page_vma(struct hstate *h, + struct vm_area_struct *vma, + unsigned long address, int avoid_reserve, + long chg) +{ + struct page *page = NULL; + struct mempolicy *mpol; + gfp_t gfp_mask; + nodemask_t *nodemask; + int nid; + + /* + * A child process with MAP_PRIVATE mappings created by their parent + * have no page reserves. This check ensures that reservations are + * not "stolen". The child may still get SIGKILLed + */ + if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) + goto err; + + /* If reserves cannot be used, ensure enough pages are in the pool */ + if (avoid_reserve && !available_huge_pages(h)) + goto err; + + gfp_mask = htlb_alloc_mask(h); + nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + + if (mpol_is_preferred_many(mpol)) { + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + + /* Fallback to all nodes if page==NULL */ + nodemask = NULL; + } + + if (!page) + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + + if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { + SetHPageRestoreReserve(page); + h->resv_huge_pages--; + } + + mpol_cond_put(mpol); + return page; + +err: + return NULL; +} + +/* + * common helper functions for hstate_next_node_to_{alloc|free}. + * We may have allocated or freed a huge page based on a different + * nodes_allowed previously, so h->next_node_to_{alloc|free} might + * be outside of *nodes_allowed. Ensure that we use an allowed + * node for alloc or free. 
+ */ +static int next_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + nid = next_node_in(nid, *nodes_allowed); + VM_BUG_ON(nid >= MAX_NUMNODES); + + return nid; +} + +static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + if (!node_isset(nid, *nodes_allowed)) + nid = next_node_allowed(nid, nodes_allowed); + return nid; +} + +/* + * returns the previously saved node ["this node"] from which to + * allocate a persistent huge page for the pool and advance the + * next node from which to allocate, handling wrap at end of node + * mask. + */ +static int hstate_next_node_to_alloc(struct hstate *h, + nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); + h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +/* + * helper for remove_pool_huge_page() - return the previously saved + * node ["this node"] from which to free a huge page. Advance the + * next node id whether or not we find a free huge page to free so + * that the next attempt to free addresses the next node. 
/*
 * Iterate once per node set in *@mask, in the hstate's round-robin order.
 *
 * @hs:       hstate whose next-node cursor is advanced on every step
 * @nr_nodes: int lvalue used as the remaining-iterations counter
 * @node:     int lvalue receiving the node id for the current iteration
 * @mask:     pointer to the nodemask of allowed nodes
 *
 * The "|| 1" keeps the loop going when the selected node id is 0.  Every
 * macro argument is parenthesized so that expression arguments (for example
 * a conditional expression yielding the mask pointer) expand as intended.
 */
#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask)		\
	for ((nr_nodes) = nodes_weight(*(mask));			\
		(nr_nodes) > 0 &&					\
		(((node) = hstate_next_node_to_alloc((hs), (mask))) || 1); \
		(nr_nodes)--)

/* Same as for_each_node_mask_to_alloc(), but walks the "to free" cursor. */
#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
	for ((nr_nodes) = nodes_weight(*(mask));			\
		(nr_nodes) > 0 &&					\
		(((node) = hstate_next_node_to_free((hs), (mask))) || 1); \
		(nr_nodes)--)
+ */ +#ifdef CONFIG_CMA + if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) + return; +#endif + + free_contig_range(page_to_pfn(page), 1 << order); +} + +#ifdef CONFIG_CONTIG_ALLOC +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) +{ + unsigned long nr_pages = pages_per_huge_page(h); + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + +#ifdef CONFIG_CMA + { + struct page *page; + int node; + + if (hugetlb_cma[nid]) { + page = cma_alloc(hugetlb_cma[nid], nr_pages, + huge_page_order(h), true); + if (page) + return page; + } + + if (!(gfp_mask & __GFP_THISNODE)) { + for_each_node_mask(node, *nodemask) { + if (node == nid || !hugetlb_cma[node]) + continue; + + page = cma_alloc(hugetlb_cma[node], nr_pages, + huge_page_order(h), true); + if (page) + return page; + } + } + } +#endif + + return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); +} + +#else /* !CONFIG_CONTIG_ALLOC */ +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) +{ + return NULL; +} +#endif /* CONFIG_CONTIG_ALLOC */ + +#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) +{ + return NULL; +} +static inline void free_gigantic_page(struct page *page, unsigned int order) { } +static inline void destroy_compound_gigantic_page(struct page *page, + unsigned int order) { } +#endif + +static inline void __clear_hugetlb_destructor(struct hstate *h, + struct page *page) +{ + lockdep_assert_held(&hugetlb_lock); + + /* + * Very subtle + * + * For non-gigantic pages set the destructor to the normal compound + * page dtor. This is needed in case someone takes an additional + * temporary ref to the page, and freeing is delayed until they drop + * their reference. + * + * For gigantic pages set the destructor to the null dtor. This + * destructor will never be called. 
Before freeing the gigantic + * page destroy_compound_gigantic_folio will turn the folio into a + * simple group of pages. After this the destructor does not + * apply. + * + */ + if (hstate_is_gigantic(h)) + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + else + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); +} + +/* + * Remove hugetlb page from lists. + * If vmemmap exists for the page, update dtor so that the page appears + * as just a compound page. Otherwise, wait until after allocating vmemmap + * to update dtor. + * + * A reference is held on the page, except in the case of demote. + * + * Must be called with hugetlb lock held. + */ +static void __remove_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus, + bool demote) +{ + int nid = page_to_nid(page); + + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + + lockdep_assert_held(&hugetlb_lock); + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + return; + + list_del(&page->lru); + + if (HPageFreed(page)) { + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + } + if (adjust_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; + } + + /* + * We can only clear the hugetlb destructor after allocating vmemmap + * pages. Otherwise, someone (memory error handling) may try to write + * to tail struct pages. + */ + if (!HPageVmemmapOptimized(page)) + __clear_hugetlb_destructor(h, page); + + /* + * In the case of demote we do not ref count the page as it will soon + * be turned into a page of smaller size. 
+ */ + if (!demote) + set_page_refcounted(page); + + h->nr_huge_pages--; + h->nr_huge_pages_node[nid]--; +} + +static void remove_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + __remove_hugetlb_page(h, page, adjust_surplus, false); +} + +static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + __remove_hugetlb_page(h, page, adjust_surplus, true); +} + +static void add_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + int zeroed; + int nid = page_to_nid(page); + + VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page); + + lockdep_assert_held(&hugetlb_lock); + + INIT_LIST_HEAD(&page->lru); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; + + if (adjust_surplus) { + h->surplus_huge_pages++; + h->surplus_huge_pages_node[nid]++; + } + + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + set_page_private(page, 0); + /* + * We have to set HPageVmemmapOptimized again as above + * set_page_private(page, 0) cleared it. + */ + SetHPageVmemmapOptimized(page); + + /* + * This page is about to be managed by the hugetlb allocator and + * should have no users. Drop our reference, and check for others + * just in case. + */ + zeroed = put_page_testzero(page); + if (!zeroed) + /* + * It is VERY unlikely soneone else has taken a ref on + * the page. In this case, we simply return as the + * hugetlb destructor (free_huge_page) will be called + * when this other ref is dropped. + */ + return; + + arch_clear_hugepage_flags(page); + enqueue_huge_page(h, page); +} + +static void __update_and_free_page(struct hstate *h, struct page *page) +{ + int i; + struct page *subpage; + bool clear_dtor = HPageVmemmapOptimized(page); + + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + return; + + /* + * If we don't know which subpages are hwpoisoned, we can't free + * the hugepage, so it's leaked intentionally. 
+ */ + if (HPageRawHwpUnreliable(page)) + return; + + if (hugetlb_vmemmap_restore(h, page)) { + spin_lock_irq(&hugetlb_lock); + /* + * If we cannot allocate vmemmap pages, just refuse to free the + * page and put the page back on the hugetlb free list and treat + * as a surplus page. + */ + add_hugetlb_page(h, page, true); + spin_unlock_irq(&hugetlb_lock); + return; + } + + /* + * Move PageHWPoison flag from head page to the raw error pages, + * which makes any healthy subpages reusable. + */ + if (unlikely(PageHWPoison(page))) + hugetlb_clear_page_hwpoison(page); + + /* + * If vmemmap pages were allocated above, then we need to clear the + * hugetlb destructor under the hugetlb lock. + */ + if (clear_dtor) { + spin_lock_irq(&hugetlb_lock); + __clear_hugetlb_destructor(h, page); + spin_unlock_irq(&hugetlb_lock); + } + + for (i = 0; i < pages_per_huge_page(h); i++) { + subpage = nth_page(page, i); + subpage->flags &= ~(1 << PG_locked | 1 << PG_error | + 1 << PG_referenced | 1 << PG_dirty | + 1 << PG_active | 1 << PG_private | + 1 << PG_writeback); + } + + /* + * Non-gigantic pages demoted from CMA allocated gigantic pages + * need to be given back to CMA in free_gigantic_page. + */ + if (hstate_is_gigantic(h) || + hugetlb_cma_page(page, huge_page_order(h))) { + destroy_compound_gigantic_page(page, huge_page_order(h)); + free_gigantic_page(page, huge_page_order(h)); + } else { + __free_pages(page, huge_page_order(h)); + } +} + +/* + * As update_and_free_page() can be called under any context, so we cannot + * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the + * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate + * the vmemmap pages. + * + * free_hpage_workfn() locklessly retrieves the linked list of pages to be + * freed and frees them one-by-one. 
As the page->mapping pointer is going + * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node + * structure of a lockless linked list of huge pages to be freed. + */ +static LLIST_HEAD(hpage_freelist); + +static void free_hpage_workfn(struct work_struct *work) +{ + struct llist_node *node; + + node = llist_del_all(&hpage_freelist); + + while (node) { + struct page *page; + struct hstate *h; + + page = container_of((struct address_space **)node, + struct page, mapping); + node = node->next; + page->mapping = NULL; + /* + * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate() + * is going to trigger because a previous call to + * remove_hugetlb_page() will set_compound_page_dtor(page, + * NULL_COMPOUND_DTOR), so do not use page_hstate() directly. + */ + h = size_to_hstate(page_size(page)); + + __update_and_free_page(h, page); + + cond_resched(); + } +} +static DECLARE_WORK(free_hpage_work, free_hpage_workfn); + +static inline void flush_free_hpage_work(struct hstate *h) +{ + if (hugetlb_vmemmap_optimizable(h)) + flush_work(&free_hpage_work); +} + +static void update_and_free_page(struct hstate *h, struct page *page, + bool atomic) +{ + if (!HPageVmemmapOptimized(page) || !atomic) { + __update_and_free_page(h, page); + return; + } + + /* + * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages. + * + * Only call schedule_work() if hpage_freelist is previously + * empty. Otherwise, schedule_work() had been called but the workfn + * hasn't retrieved the list yet. 
+ */ + if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist)) + schedule_work(&free_hpage_work); +} + +static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) +{ + struct page *page, *t_page; + + list_for_each_entry_safe(page, t_page, list, lru) { + update_and_free_page(h, page, false); + cond_resched(); + } +} + +struct hstate *size_to_hstate(unsigned long size) +{ + struct hstate *h; + + for_each_hstate(h) { + if (huge_page_size(h) == size) + return h; + } + return NULL; +} + +void free_huge_page(struct page *page) +{ + /* + * Can't pass hstate in here because it is called from the + * compound page destructor. + */ + struct hstate *h = page_hstate(page); + int nid = page_to_nid(page); + struct hugepage_subpool *spool = hugetlb_page_subpool(page); + bool restore_reserve; + unsigned long flags; + + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(page_mapcount(page), page); + + hugetlb_set_page_subpool(page, NULL); + if (PageAnon(page)) + __ClearPageAnonExclusive(page); + page->mapping = NULL; + restore_reserve = HPageRestoreReserve(page); + ClearHPageRestoreReserve(page); + + /* + * If HPageRestoreReserve was set on page, page allocation consumed a + * reservation. If the page was associated with a subpool, there + * would have been a page reserved in the subpool before allocation + * via hugepage_subpool_get_pages(). Since we are 'restoring' the + * reservation, do not call hugepage_subpool_put_pages() as this will + * remove the reserved page from the subpool. + */ + if (!restore_reserve) { + /* + * A return code of zero implies that the subpool will be + * under its minimum size if the reservation is not restored + * after page is free. Therefore, force restore_reserve + * operation. 
+ */ + if (hugepage_subpool_put_pages(spool, 1) == 0) + restore_reserve = true; + } + + spin_lock_irqsave(&hugetlb_lock, flags); + ClearHPageMigratable(page); + hugetlb_cgroup_uncharge_page(hstate_index(h), + pages_per_huge_page(h), page); + hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), + pages_per_huge_page(h), page); + if (restore_reserve) + h->resv_huge_pages++; + + if (HPageTemporary(page)) { + remove_hugetlb_page(h, page, false); + spin_unlock_irqrestore(&hugetlb_lock, flags); + update_and_free_page(h, page, true); + } else if (h->surplus_huge_pages_node[nid]) { + /* remove the page from active list */ + remove_hugetlb_page(h, page, true); + spin_unlock_irqrestore(&hugetlb_lock, flags); + update_and_free_page(h, page, true); + } else { + arch_clear_hugepage_flags(page); + enqueue_huge_page(h, page); + spin_unlock_irqrestore(&hugetlb_lock, flags); + } +} + +/* + * Must be called with the hugetlb lock held + */ +static void __prep_account_new_huge_page(struct hstate *h, int nid) +{ + lockdep_assert_held(&hugetlb_lock); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; +} + +static void __prep_new_huge_page(struct hstate *h, struct page *page) +{ + hugetlb_vmemmap_optimize(h, page); + INIT_LIST_HEAD(&page->lru); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + hugetlb_set_page_subpool(page, NULL); + set_hugetlb_cgroup(page, NULL); + set_hugetlb_cgroup_rsvd(page, NULL); +} + +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +{ + __prep_new_huge_page(h, page); + spin_lock_irq(&hugetlb_lock); + __prep_account_new_huge_page(h, nid); + spin_unlock_irq(&hugetlb_lock); +} + +static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, + bool demote) +{ + int i, j; + int nr_pages = 1 << order; + struct page *p; + + /* we rely on prep_new_huge_page to set the destructor */ + set_compound_order(page, order); + __ClearPageReserved(page); + __SetPageHead(page); + for (i = 0; i < nr_pages; i++) { + p = 
nth_page(page, i); + + /* + * For gigantic hugepages allocated through bootmem at + * boot, it's safer to be consistent with the not-gigantic + * hugepages and clear the PG_reserved bit from all tail pages + * too. Otherwise drivers using get_user_pages() to access tail + * pages may get the reference counting wrong if they see + * PG_reserved set on a tail page (despite the head page not + * having PG_reserved set). Enforcing this consistency between + * head and tail pages allows drivers to optimize away a check + * on the head page when they need know if put_page() is needed + * after get_user_pages(). + */ + if (i != 0) /* head page cleared above */ + __ClearPageReserved(p); + /* + * Subtle and very unlikely + * + * Gigantic 'page allocators' such as memblock or cma will + * return a set of pages with each page ref counted. We need + * to turn this set of pages into a compound page with tail + * page ref counts set to zero. Code such as speculative page + * cache adding could take a ref on a 'to be' tail page. + * We need to respect any increased ref count, and only set + * the ref count to zero if count is currently 1. If count + * is not 1, we return an error. An error return indicates + * the set of pages can not be converted to a gigantic page. + * The caller who allocated the pages should then discard the + * pages using the appropriate free interface. + * + * In the case of demote, the ref count will be zero. 
+ */ + if (!demote) { + if (!page_ref_freeze(p, 1)) { + pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); + goto out_error; + } + } else { + VM_BUG_ON_PAGE(page_count(p), p); + } + if (i != 0) + set_compound_head(p, page); + } + atomic_set(compound_mapcount_ptr(page), -1); + atomic_set(compound_pincount_ptr(page), 0); + return true; + +out_error: + /* undo page modifications made above */ + for (j = 0; j < i; j++) { + p = nth_page(page, j); + if (j != 0) + clear_compound_head(p); + set_page_refcounted(p); + } + /* need to clear PG_reserved on remaining tail pages */ + for (; j < nr_pages; j++) { + p = nth_page(page, j); + __ClearPageReserved(p); + } + set_compound_order(page, 0); +#ifdef CONFIG_64BIT + page[1].compound_nr = 0; +#endif + __ClearPageHead(page); + return false; +} + +static bool prep_compound_gigantic_page(struct page *page, unsigned int order) +{ + return __prep_compound_gigantic_page(page, order, false); +} + +static bool prep_compound_gigantic_page_for_demote(struct page *page, + unsigned int order) +{ + return __prep_compound_gigantic_page(page, order, true); +} + +/* + * PageHuge() only returns true for hugetlbfs pages, but not for normal or + * transparent huge pages. See the PageTransHuge() documentation for more + * details. + */ +int PageHuge(struct page *page) +{ + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + return page[1].compound_dtor == HUGETLB_PAGE_DTOR; +} +EXPORT_SYMBOL_GPL(PageHuge); + +/* + * PageHeadHuge() only returns true for hugetlbfs head page, but not for + * normal or transparent huge pages. + */ +int PageHeadHuge(struct page *page_head) +{ + if (!PageHead(page_head)) + return 0; + + return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR; +} +EXPORT_SYMBOL_GPL(PageHeadHuge); + +/* + * Find and lock address space (mapping) in write mode. + * + * Upon entry, the page is locked which means that page_mapping() is + * stable. Due to locking order, we can only trylock_write. 
If we can + * not get the lock, simply return NULL to caller. + */ +struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) +{ + struct address_space *mapping = page_mapping(hpage); + + if (!mapping) + return mapping; + + if (i_mmap_trylock_write(mapping)) + return mapping; + + return NULL; +} + +pgoff_t hugetlb_basepage_index(struct page *page) +{ + struct page *page_head = compound_head(page); + pgoff_t index = page_index(page_head); + unsigned long compound_idx; + + if (compound_order(page_head) >= MAX_ORDER) + compound_idx = page_to_pfn(page) - page_to_pfn(page_head); + else + compound_idx = page - page_head; + + return (index << compound_order(page_head)) + compound_idx; +} + +static struct page *alloc_buddy_huge_page(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) +{ + int order = huge_page_order(h); + struct page *page; + bool alloc_try_hard = true; + bool retry = true; + + /* + * By default we always try hard to allocate the page with + * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in + * a loop (to adjust global huge page counts) and previous allocation + * failed, do not continue to try hard on the same node. Use the + * node_alloc_noretry bitmap to manage this state information. + */ + if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) + alloc_try_hard = false; + gfp_mask |= __GFP_COMP|__GFP_NOWARN; + if (alloc_try_hard) + gfp_mask |= __GFP_RETRY_MAYFAIL; + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); +retry: + page = __alloc_pages(gfp_mask, order, nid, nmask); + + /* Freeze head page */ + if (page && !page_ref_freeze(page, 1)) { + __free_pages(page, order); + if (retry) { /* retry once */ + retry = false; + goto retry; + } + /* WOW! twice in a row. 
*/ + pr_warn("HugeTLB head page unexpected inflated ref count\n"); + page = NULL; + } + + if (page) + __count_vm_event(HTLB_BUDDY_PGALLOC); + else + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + + /* + * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this + * indicates an overall state change. Clear bit so that we resume + * normal 'try hard' allocations. + */ + if (node_alloc_noretry && page && !alloc_try_hard) + node_clear(nid, *node_alloc_noretry); + + /* + * If we tried hard to get a page but failed, set bit so that + * subsequent attempts will not try as hard until there is an + * overall state change. + */ + if (node_alloc_noretry && !page && alloc_try_hard) + node_set(nid, *node_alloc_noretry); + + return page; +} + +/* + * Common helper to allocate a fresh hugetlb page. All specific allocators + * should use this function to get new hugetlb pages + * + * Note that returned page is 'frozen': ref count of head page and all tail + * pages is zero. + */ +static struct page *alloc_fresh_huge_page(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) +{ + struct page *page; + bool retry = false; + +retry: + if (hstate_is_gigantic(h)) + page = alloc_gigantic_page(h, gfp_mask, nid, nmask); + else + page = alloc_buddy_huge_page(h, gfp_mask, + nid, nmask, node_alloc_noretry); + if (!page) + return NULL; + + if (hstate_is_gigantic(h)) { + if (!prep_compound_gigantic_page(page, huge_page_order(h))) { + /* + * Rare failure to convert pages to compound page. + * Free pages and try again - ONCE! + */ + free_gigantic_page(page, huge_page_order(h)); + if (!retry) { + retry = true; + goto retry; + } + return NULL; + } + } + prep_new_huge_page(h, page, page_to_nid(page)); + + return page; +} + +/* + * Allocates a fresh page to the hugetlb allocator pool in the node interleaved + * manner. 
+ */ +static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, + nodemask_t *node_alloc_noretry) +{ + struct page *page; + int nr_nodes, node; + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, + node_alloc_noretry); + if (page) + break; + } + + if (!page) + return 0; + + free_huge_page(page); /* free it into the hugepage allocator */ + + return 1; +} + +/* + * Remove huge page from pool from next node to free. Attempt to keep + * persistent huge pages more or less balanced over allowed nodes. + * This routine only 'removes' the hugetlb page. The caller must make + * an additional call to free the page to low level allocators. + * Called with hugetlb_lock locked. + */ +static struct page *remove_pool_huge_page(struct hstate *h, + nodemask_t *nodes_allowed, + bool acct_surplus) +{ + int nr_nodes, node; + struct page *page = NULL; + + lockdep_assert_held(&hugetlb_lock); + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { + /* + * If we're returning unused surplus pages, only examine + * nodes with surplus pages. + */ + if ((!acct_surplus || h->surplus_huge_pages_node[node]) && + !list_empty(&h->hugepage_freelists[node])) { + page = list_entry(h->hugepage_freelists[node].next, + struct page, lru); + remove_hugetlb_page(h, page, acct_surplus); + break; + } + } + + return page; +} + +/* + * Dissolve a given free hugepage into free buddy pages. This function does + * nothing for in-use hugepages and non-hugepages. + * This function returns values like below: + * + * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages + * when the system is under memory pressure and the feature of + * freeing unused vmemmap pages associated with each hugetlb page + * is enabled. + * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use + * (allocated or reserved.) 
 *       0:  successfully dissolved free hugepages or the page is not a
 *           hugepage (considered as already dissolved)
 */
int dissolve_free_huge_page(struct page *page)
{
	int rc = -EBUSY;

retry:
	/* Not to disrupt normal path by vainly holding hugetlb_lock */
	if (!PageHuge(page))
		return 0;

	spin_lock_irq(&hugetlb_lock);
	/* Re-check under the lock: the page may have been dissolved meanwhile. */
	if (!PageHuge(page)) {
		rc = 0;
		goto out;
	}

	/* Only a page with a zero ref count is a candidate; otherwise -EBUSY. */
	if (!page_count(page)) {
		struct page *head = compound_head(page);
		struct hstate *h = page_hstate(head);
		if (!available_huge_pages(h))
			goto out;

		/*
		 * We should make sure that the page is already on the free list
		 * when it is dissolved.
		 */
		if (unlikely(!HPageFreed(head))) {
			spin_unlock_irq(&hugetlb_lock);
			cond_resched();

			/*
			 * Theoretically, we should return -EBUSY when we
			 * encounter this race. In fact, we have a chance
			 * to successfully dissolve the page if we do a
			 * retry. Because the race window is quite small.
			 * If we seize this opportunity, it is an optimization
			 * for increasing the success rate of dissolving page.
			 */
			goto retry;
		}

		/* Take the page out of the pool and shrink the pool size. */
		remove_hugetlb_page(h, head, false);
		h->max_huge_pages--;
		spin_unlock_irq(&hugetlb_lock);

		/*
		 * Normally update_and_free_page will allocate required vmemmap
		 * before freeing the page.  update_and_free_page will fail to
		 * free the page if it can not allocate required vmemmap.  We
		 * need to adjust max_huge_pages if the page is not freed.
		 * Attempt to allocate vmemmap here so that we can take
		 * appropriate action on failure.
		 */
		rc = hugetlb_vmemmap_restore(h, head);
		if (!rc) {
			update_and_free_page(h, head, false);
		} else {
			/* Roll back: put the page back and restore the count. */
			spin_lock_irq(&hugetlb_lock);
			add_hugetlb_page(h, head, false);
			h->max_huge_pages++;
			spin_unlock_irq(&hugetlb_lock);
		}

		return rc;
	}
out:
	spin_unlock_irq(&hugetlb_lock);
	return rc;
}

/*
 * Dissolve free hugepages in a given pfn range.  Used by memory hotplug to
 * make specified memory blocks removable from the system.
 * Note that this will dissolve a free gigantic hugepage completely, if any
 * part of it lies within the given range.
 * Also note that if dissolve_free_huge_page() returns with an error, all
 * free hugepages that were dissolved before that error are lost.
 */
int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int rc = 0;
	unsigned int order;
	struct hstate *h;

	if (!hugepages_supported())
		return rc;

	/* Step through the range using the smallest huge page order in use. */
	order = huge_page_order(&default_hstate);
	for_each_hstate(h)
		order = min(order, huge_page_order(h));

	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
		page = pfn_to_page(pfn);
		rc = dissolve_free_huge_page(page);
		/* Stop at the first failure and report it to the caller. */
		if (rc)
			break;
	}

	return rc;
}

/*
 * Allocates a fresh surplus page from the page allocator.
 *
 * Returns NULL for gigantic hstates, when the overcommit limit is already
 * reached, or when the allocation fails.
 */
static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
						int nid, nodemask_t *nmask)
{
	struct page *page = NULL;

	if (hstate_is_gigantic(h))
		return NULL;

	/* Cheap early check so we do not allocate when over the limit. */
	spin_lock_irq(&hugetlb_lock);
	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
		goto out_unlock;
	spin_unlock_irq(&hugetlb_lock);

	/* The allocation itself is done without holding hugetlb_lock. */
	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
	if (!page)
		return NULL;

	spin_lock_irq(&hugetlb_lock);
	/*
	 * We could have raced with the pool size change.
	 * Double check that and simply deallocate the new page
	 * if we would end up overcommitting the surpluses. Abuse
	 * temporary page to workaround the nasty free_huge_page
	 * codeflow
	 */
	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
		SetHPageTemporary(page);
		spin_unlock_irq(&hugetlb_lock);
		free_huge_page(page);
		return NULL;
	}

	/* Account the new page as surplus, globally and on its node. */
	h->surplus_huge_pages++;
	h->surplus_huge_pages_node[page_to_nid(page)]++;

out_unlock:
	spin_unlock_irq(&hugetlb_lock);

	return page;
}

/*
 * Allocate a fresh, refcounted huge page flagged as temporary.  Returns NULL
 * for gigantic hstates or when the allocation fails.
 */
static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
				     int nid, nodemask_t *nmask)
{
	struct page *page;

	if (hstate_is_gigantic(h))
		return NULL;

	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
	if (!page)
		return NULL;

	/* fresh huge pages are frozen */
	set_page_refcounted(page);

	/*
	 * We do not account these pages as surplus because they are only
	 * temporary and will be released properly on the last reference
	 */
	SetHPageTemporary(page);

	return page;
}

/*
 * Use the VMA's mpolicy to allocate a huge page from the buddy.
+ */ +static +struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + struct page *page = NULL; + struct mempolicy *mpol; + gfp_t gfp_mask = htlb_alloc_mask(h); + int nid; + nodemask_t *nodemask; + + nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); + if (mpol_is_preferred_many(mpol)) { + gfp_t gfp = gfp_mask | __GFP_NOWARN; + + gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = alloc_surplus_huge_page(h, gfp, nid, nodemask); + + /* Fallback to all nodes if page==NULL */ + nodemask = NULL; + } + + if (!page) + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); + mpol_cond_put(mpol); + return page; +} + +/* page migration callback function */ +struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, + nodemask_t *nmask, gfp_t gfp_mask) +{ + spin_lock_irq(&hugetlb_lock); + if (available_huge_pages(h)) { + struct page *page; + + page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); + if (page) { + spin_unlock_irq(&hugetlb_lock); + return page; + } + } + spin_unlock_irq(&hugetlb_lock); + + return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); +} + +/* mempolicy aware migration callback */ +struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, + unsigned long address) +{ + struct mempolicy *mpol; + nodemask_t *nodemask; + struct page *page; + gfp_t gfp_mask; + int node; + + gfp_mask = htlb_alloc_mask(h); + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask); + mpol_cond_put(mpol); + + return page; +} + +/* + * Increase the hugetlb pool such that it can accommodate a reservation + * of size 'delta'. 
 */
static int gather_surplus_pages(struct hstate *h, long delta)
	__must_hold(&hugetlb_lock)
{
	LIST_HEAD(surplus_list);
	struct page *page, *tmp;
	int ret;
	long i;
	long needed, allocated;
	bool alloc_ok = true;

	lockdep_assert_held(&hugetlb_lock);
	/* Pages missing to back the existing reservations plus 'delta'. */
	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
	if (needed <= 0) {
		/* The free pool already covers it: just commit. */
		h->resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;

	ret = -ENOMEM;
retry:
	/* Allocate with the lock dropped; it is retaken after the loop. */
	spin_unlock_irq(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
				NUMA_NO_NODE, NULL);
		if (!page) {
			alloc_ok = false;
			break;
		}
		list_add(&page->lru, &surplus_list);
		cond_resched();
	}
	allocated += i;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock_irq(&hugetlb_lock);
	needed = (h->resv_huge_pages + delta) -
			(h->free_huge_pages + allocated);
	if (needed > 0) {
		if (alloc_ok)
			goto retry;
		/*
		 * We were not able to allocate enough pages to
		 * satisfy the entire reservation so we free what
		 * we've allocated so far.
		 */
		goto free;
	}
	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	h->resv_huge_pages += delta;
	ret = 0;

	/* Free the needed pages to the hugetlb pool */
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		if ((--needed) < 0)
			break;
		/* Add the page to the hugetlb allocator */
		enqueue_huge_page(h, page);
	}
free:
	spin_unlock_irq(&hugetlb_lock);

	/*
	 * Free unnecessary surplus pages to the buddy allocator.
	 * Pages have no ref count, call free_huge_page directly.
	 */
	list_for_each_entry_safe(page, tmp, &surplus_list, lru)
		free_huge_page(page);
	/* Return with hugetlb_lock held again, as on entry. */
	spin_lock_irq(&hugetlb_lock);

	return ret;
}

/*
 * This routine has two main purposes:
 * 1) Decrement the reservation count (resv_huge_pages) by the value passed
 *    in unused_resv_pages.  This corresponds to the prior adjustments made
 *    to the associated reservation map.
 * 2) Free any unused surplus pages that may have been allocated to satisfy
 *    the reservation.  As many as unused_resv_pages may be freed.
 *
 * Called with hugetlb_lock held; the lock is dropped while the collected
 * pages are freed and retaken before returning.
 */
static void return_unused_surplus_pages(struct hstate *h,
					unsigned long unused_resv_pages)
{
	unsigned long nr_pages;
	struct page *page;
	LIST_HEAD(page_list);

	lockdep_assert_held(&hugetlb_lock);
	/* Uncommit the reservation */
	h->resv_huge_pages -= unused_resv_pages;

	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
		goto out;

	/*
	 * Part (or even all) of the reservation could have been backed
	 * by pre-allocated pages. Only free surplus pages.
	 */
	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);

	/*
	 * We want to release as many surplus pages as possible, spread
	 * evenly across all nodes with memory. Iterate across these nodes
	 * until we can no longer free unreserved surplus pages. This occurs
	 * when the nodes with surplus pages have no free pages.
	 * remove_pool_huge_page() will balance the freed pages across the
	 * on-line nodes with memory and will handle the hstate accounting.
	 */
	while (nr_pages--) {
		page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
		if (!page)
			goto out;

		list_add(&page->lru, &page_list);
	}

out:
	/* page_list may be empty here; the bulk free still runs unlocked. */
	spin_unlock_irq(&hugetlb_lock);
	update_and_free_pages_bulk(h, &page_list);
	spin_lock_irq(&hugetlb_lock);
}


/*
 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
 * are used by the huge page allocation routines to manage reservations.
+ * + * vma_needs_reservation is called to determine if the huge page at addr + * within the vma has an associated reservation. If a reservation is + * needed, the value 1 is returned. The caller is then responsible for + * managing the global reservation and subpool usage counts. After + * the huge page has been allocated, vma_commit_reservation is called + * to add the page to the reservation map. If the page allocation fails, + * the reservation must be ended instead of committed. vma_end_reservation + * is called in such cases. + * + * In the normal case, vma_commit_reservation returns the same value + * as the preceding vma_needs_reservation call. The only time this + * is not the case is if a reserve map was changed between calls. It + * is the responsibility of the caller to notice the difference and + * take appropriate action. + * + * vma_add_reservation is used in error paths where a reservation must + * be restored when a newly allocated huge page must be freed. It is + * to be called after calling vma_needs_reservation to determine if a + * reservation exists. + * + * vma_del_reservation is used in error paths where an entry in the reserve + * map was created during huge page allocation and must be removed. It is to + * be called after calling vma_needs_reservation to determine if a reservation + * exists. 
+ */ +enum vma_resv_mode { + VMA_NEEDS_RESV, + VMA_COMMIT_RESV, + VMA_END_RESV, + VMA_ADD_RESV, + VMA_DEL_RESV, +}; +static long __vma_reservation_common(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, + enum vma_resv_mode mode) +{ + struct resv_map *resv; + pgoff_t idx; + long ret; + long dummy_out_regions_needed; + + resv = vma_resv_map(vma); + if (!resv) + return 1; + + idx = vma_hugecache_offset(h, vma, addr); + switch (mode) { + case VMA_NEEDS_RESV: + ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed); + /* We assume that vma_reservation_* routines always operate on + * 1 page, and that adding to resv map a 1 page entry can only + * ever require 1 region. + */ + VM_BUG_ON(dummy_out_regions_needed != 1); + break; + case VMA_COMMIT_RESV: + ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); + /* region_add calls of range 1 should never fail. */ + VM_BUG_ON(ret < 0); + break; + case VMA_END_RESV: + region_abort(resv, idx, idx + 1, 1); + ret = 0; + break; + case VMA_ADD_RESV: + if (vma->vm_flags & VM_MAYSHARE) { + ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); + /* region_add calls of range 1 should never fail. */ + VM_BUG_ON(ret < 0); + } else { + region_abort(resv, idx, idx + 1, 1); + ret = region_del(resv, idx, idx + 1); + } + break; + case VMA_DEL_RESV: + if (vma->vm_flags & VM_MAYSHARE) { + region_abort(resv, idx, idx + 1, 1); + ret = region_del(resv, idx, idx + 1); + } else { + ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); + /* region_add calls of range 1 should never fail. */ + VM_BUG_ON(ret < 0); + } + break; + default: + BUG(); + } + + if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) + return ret; + /* + * We know private mapping must have HPAGE_RESV_OWNER set. + * + * In most cases, reserves always exist for private mappings. + * However, a file associated with mapping could have been + * hole punched or truncated after reserves were consumed. 
+ * As subsequent fault on such a range will not use reserves. + * Subtle - The reserve map for private mappings has the + * opposite meaning than that of shared mappings. If NO + * entry is in the reserve map, it means a reservation exists. + * If an entry exists in the reserve map, it means the + * reservation has already been consumed. As a result, the + * return value of this routine is the opposite of the + * value returned from reserve map manipulation routines above. + */ + if (ret > 0) + return 0; + if (ret == 0) + return 1; + return ret; +} + +static long vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); +} + +static long vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); +} + +static void vma_end_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); +} + +static long vma_add_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); +} + +static long vma_del_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); +} + +/* + * This routine is called to restore reservation information on error paths. + * It should ONLY be called for pages allocated via alloc_huge_page(), and + * the hugetlb mutex should remain held when calling this routine. + * + * It handles two specific cases: + * 1) A reservation was in place and the page consumed the reservation. + * HPageRestoreReserve is set in the page. + * 2) No reservation was in place for the page, so HPageRestoreReserve is + * not set. However, alloc_huge_page always updates the reserve map. 
+ * + * In case 1, free_huge_page later in the error path will increment the + * global reserve count. But, free_huge_page does not have enough context + * to adjust the reservation map. This case deals primarily with private + * mappings. Adjust the reserve map here to be consistent with global + * reserve count adjustments to be made by free_huge_page. Make sure the + * reserve map indicates there is a reservation present. + * + * In case 2, simply undo reserve map modifications done by alloc_huge_page. + */ +void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, + unsigned long address, struct page *page) +{ + long rc = vma_needs_reservation(h, vma, address); + + if (HPageRestoreReserve(page)) { + if (unlikely(rc < 0)) + /* + * Rare out of memory condition in reserve map + * manipulation. Clear HPageRestoreReserve so that + * global reserve count will not be incremented + * by free_huge_page. This will make it appear + * as though the reservation for this page was + * consumed. This may prevent the task from + * faulting in the page at a later time. This + * is better than inconsistent global huge page + * accounting of reserve counts. + */ + ClearHPageRestoreReserve(page); + else if (rc) + (void)vma_add_reservation(h, vma, address); + else + vma_end_reservation(h, vma, address); + } else { + if (!rc) { + /* + * This indicates there is an entry in the reserve map + * not added by alloc_huge_page. We know it was added + * before the alloc_huge_page call, otherwise + * HPageRestoreReserve would be set on the page. + * Remove the entry so that a subsequent allocation + * does not consume a reservation. + */ + rc = vma_del_reservation(h, vma, address); + if (rc < 0) + /* + * VERY rare out of memory condition. Since + * we can not delete the entry, set + * HPageRestoreReserve so that the reserve + * count will be incremented when the page + * is freed. This reserve will be consumed + * on a subsequent allocation. 
+ */ + SetHPageRestoreReserve(page); + } else if (rc < 0) { + /* + * Rare out of memory condition from + * vma_needs_reservation call. Memory allocation is + * only attempted if a new entry is needed. Therefore, + * this implies there is not an entry in the + * reserve map. + * + * For shared mappings, no entry in the map indicates + * no reservation. We are done. + */ + if (!(vma->vm_flags & VM_MAYSHARE)) + /* + * For private mappings, no entry indicates + * a reservation is present. Since we can + * not add an entry, set SetHPageRestoreReserve + * on the page so reserve count will be + * incremented when freed. This reserve will + * be consumed on a subsequent allocation. + */ + SetHPageRestoreReserve(page); + } else + /* + * No reservation present, do nothing + */ + vma_end_reservation(h, vma, address); + } +} + +/* + * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one + * @h: struct hstate old page belongs to + * @old_page: Old page to dissolve + * @list: List to isolate the page in case we need to + * Returns 0 on success, otherwise negated error. + */ +static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, + struct list_head *list) +{ + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + int nid = page_to_nid(old_page); + struct page *new_page; + int ret = 0; + + /* + * Before dissolving the page, we need to allocate a new one for the + * pool to remain stable. Here, we allocate the page and 'prep' it + * by doing everything but actually updating counters and adding to + * the pool. This simplifies and let us do most of the processing + * under the lock. + */ + new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); + if (!new_page) + return -ENOMEM; + __prep_new_huge_page(h, new_page); + +retry: + spin_lock_irq(&hugetlb_lock); + if (!PageHuge(old_page)) { + /* + * Freed from under us. Drop new_page too. 
+ */ + goto free_new; + } else if (page_count(old_page)) { + /* + * Someone has grabbed the page, try to isolate it here. + * Fail with -EBUSY if not possible. + */ + spin_unlock_irq(&hugetlb_lock); + ret = isolate_hugetlb(old_page, list); + spin_lock_irq(&hugetlb_lock); + goto free_new; + } else if (!HPageFreed(old_page)) { + /* + * Page's refcount is 0 but it has not been enqueued in the + * freelist yet. Race window is small, so we can succeed here if + * we retry. + */ + spin_unlock_irq(&hugetlb_lock); + cond_resched(); + goto retry; + } else { + /* + * Ok, old_page is still a genuine free hugepage. Remove it from + * the freelist and decrease the counters. These will be + * incremented again when calling __prep_account_new_huge_page() + * and enqueue_huge_page() for new_page. The counters will remain + * stable since this happens under the lock. + */ + remove_hugetlb_page(h, old_page, false); + + /* + * Ref count on new page is already zero as it was dropped + * earlier. It can be directly added to the pool free list. + */ + __prep_account_new_huge_page(h, nid); + enqueue_huge_page(h, new_page); + + /* + * Pages have been replaced, we can safely free the old one. + */ + spin_unlock_irq(&hugetlb_lock); + update_and_free_page(h, old_page, false); + } + + return ret; + +free_new: + spin_unlock_irq(&hugetlb_lock); + /* Page has a zero ref count, but needs a ref to be freed */ + set_page_refcounted(new_page); + update_and_free_page(h, new_page, false); + + return ret; +} + +int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) +{ + struct hstate *h; + struct page *head; + int ret = -EBUSY; + + /* + * The page might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + * Return success when racing as if we dissolved the page ourselves. 
+ */ + spin_lock_irq(&hugetlb_lock); + if (PageHuge(page)) { + head = compound_head(page); + h = page_hstate(head); + } else { + spin_unlock_irq(&hugetlb_lock); + return 0; + } + spin_unlock_irq(&hugetlb_lock); + + /* + * Fence off gigantic pages as there is a cyclic dependency between + * alloc_contig_range and them. Return -ENOMEM as this has the effect + * of bailing out right away without further retrying. + */ + if (hstate_is_gigantic(h)) + return -ENOMEM; + + if (page_count(head) && !isolate_hugetlb(head, list)) + ret = 0; + else if (!page_count(head)) + ret = alloc_and_dissolve_huge_page(h, head, list); + + return ret; +} + +struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) +{ + struct hugepage_subpool *spool = subpool_vma(vma); + struct hstate *h = hstate_vma(vma); + struct page *page; + long map_chg, map_commit; + long gbl_chg; + int ret, idx; + struct hugetlb_cgroup *h_cg; + bool deferred_reserve; + + idx = hstate_index(h); + /* + * Examine the region/reserve map to determine if the process + * has a reservation for the page to be allocated. A return + * code of zero indicates a reservation exists (no change). + */ + map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); + if (map_chg < 0) + return ERR_PTR(-ENOMEM); + + /* + * Processes that did not create the mapping will have no + * reserves as indicated by the region/reserve map. Check + * that the allocation will not exceed the subpool limit. + * Allocations for MAP_NORESERVE mappings also need to be + * checked against any subpool limit. + */ + if (map_chg || avoid_reserve) { + gbl_chg = hugepage_subpool_get_pages(spool, 1); + if (gbl_chg < 0) { + vma_end_reservation(h, vma, addr); + return ERR_PTR(-ENOSPC); + } + + /* + * Even though there was no reservation in the region/reserve + * map, there could be reservations associated with the + * subpool that can be used. 
This would be indicated if the + * return value of hugepage_subpool_get_pages() is zero. + * However, if avoid_reserve is specified we still avoid even + * the subpool reservations. + */ + if (avoid_reserve) + gbl_chg = 1; + } + + /* If this allocation is not consuming a reservation, charge it now. + */ + deferred_reserve = map_chg || avoid_reserve; + if (deferred_reserve) { + ret = hugetlb_cgroup_charge_cgroup_rsvd( + idx, pages_per_huge_page(h), &h_cg); + if (ret) + goto out_subpool_put; + } + + ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); + if (ret) + goto out_uncharge_cgroup_reservation; + + spin_lock_irq(&hugetlb_lock); + /* + * glb_chg is passed to indicate whether or not a page must be taken + * from the global free pool (global change). gbl_chg == 0 indicates + * a reservation exists for the allocation. + */ + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); + if (!page) { + spin_unlock_irq(&hugetlb_lock); + page = alloc_buddy_huge_page_with_mpol(h, vma, addr); + if (!page) + goto out_uncharge_cgroup; + spin_lock_irq(&hugetlb_lock); + if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { + SetHPageRestoreReserve(page); + h->resv_huge_pages--; + } + list_add(&page->lru, &h->hugepage_activelist); + set_page_refcounted(page); + /* Fall through */ + } + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); + /* If allocation is not consuming a reservation, also store the + * hugetlb_cgroup pointer on the page. + */ + if (deferred_reserve) { + hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), + h_cg, page); + } + + spin_unlock_irq(&hugetlb_lock); + + hugetlb_set_page_subpool(page, spool); + + map_commit = vma_commit_reservation(h, vma, addr); + if (unlikely(map_chg > map_commit)) { + /* + * The page was added to the reservation map between + * vma_needs_reservation and vma_commit_reservation. + * This indicates a race with hugetlb_reserve_pages. 
+ * Adjust for the subpool count incremented above AND + * in hugetlb_reserve_pages for the same page. Also, + * the reservation count added in hugetlb_reserve_pages + * no longer applies. + */ + long rsv_adjust; + + rsv_adjust = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -rsv_adjust); + if (deferred_reserve) + hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), + pages_per_huge_page(h), page); + } + return page; + +out_uncharge_cgroup: + hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); +out_uncharge_cgroup_reservation: + if (deferred_reserve) + hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), + h_cg); +out_subpool_put: + if (map_chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); + vma_end_reservation(h, vma, addr); + return ERR_PTR(-ENOSPC); +} + +int alloc_bootmem_huge_page(struct hstate *h, int nid) + __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); +int __alloc_bootmem_huge_page(struct hstate *h, int nid) +{ + struct huge_bootmem_page *m = NULL; /* initialize for clang */ + int nr_nodes, node; + + /* do node specific alloc */ + if (nid != NUMA_NO_NODE) { + m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), + 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + if (!m) + return 0; + goto found; + } + /* allocate from next node when distributing huge pages */ + for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { + m = memblock_alloc_try_nid_raw( + huge_page_size(h), huge_page_size(h), + 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). + */ + if (!m) + return 0; + goto found; + } + +found: + /* Put them into a private list first because mem_map is not up yet */ + INIT_LIST_HEAD(&m->list); + list_add(&m->list, &huge_boot_pages); + m->hstate = h; + return 1; +} + +/* + * Put bootmem huge pages into the standard lists after mem_map is up. 
+ * Note: This only applies to gigantic (order > MAX_ORDER) pages. + */ +static void __init gather_bootmem_prealloc(void) +{ + struct huge_bootmem_page *m; + + list_for_each_entry(m, &huge_boot_pages, list) { + struct page *page = virt_to_page(m); + struct hstate *h = m->hstate; + + VM_BUG_ON(!hstate_is_gigantic(h)); + WARN_ON(page_count(page) != 1); + if (prep_compound_gigantic_page(page, huge_page_order(h))) { + WARN_ON(PageReserved(page)); + prep_new_huge_page(h, page, page_to_nid(page)); + free_huge_page(page); /* add to the hugepage allocator */ + } else { + /* VERY unlikely inflated ref count on a tail page */ + free_gigantic_page(page, huge_page_order(h)); + } + + /* + * We need to restore the 'stolen' pages to totalram_pages + * in order to fix confusing memory reports from free(1) and + * other side-effects, like CommitLimit going negative. + */ + adjust_managed_page_count(page, pages_per_huge_page(h)); + cond_resched(); + } +} +static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) +{ + unsigned long i; + char buf[32]; + + for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { + if (hstate_is_gigantic(h)) { + if (!alloc_bootmem_huge_page(h, nid)) + break; + } else { + struct page *page; + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + + page = alloc_fresh_huge_page(h, gfp_mask, nid, + &node_states[N_MEMORY], NULL); + if (!page) + break; + free_huge_page(page); /* free it into the hugepage allocator */ + } + cond_resched(); + } + if (i == h->max_huge_pages_node[nid]) + return; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: allocating %u of page size %s failed node%d. 
Only allocated %lu hugepages.\n", + h->max_huge_pages_node[nid], buf, nid, i); + h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); + h->max_huge_pages_node[nid] = i; +} + +static void __init hugetlb_hstate_alloc_pages(struct hstate *h) +{ + unsigned long i; + nodemask_t *node_alloc_noretry; + bool node_specific_alloc = false; + + /* skip gigantic hugepages allocation if hugetlb_cma enabled */ + if (hstate_is_gigantic(h) && hugetlb_cma_size) { + pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); + return; + } + + /* do node specific alloc */ + for_each_online_node(i) { + if (h->max_huge_pages_node[i] > 0) { + hugetlb_hstate_alloc_pages_onenode(h, i); + node_specific_alloc = true; + } + } + + if (node_specific_alloc) + return; + + /* below will do all node balanced alloc */ + if (!hstate_is_gigantic(h)) { + /* + * Bit mask controlling how hard we retry per-node allocations. + * Ignore errors as lower level routines can deal with + * node_alloc_noretry == NULL. If this kmalloc fails at boot + * time, we are likely in bigger trouble. + */ + node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), + GFP_KERNEL); + } else { + /* allocations done at boot time */ + node_alloc_noretry = NULL; + } + + /* bit mask controlling how hard we retry per-node allocations */ + if (node_alloc_noretry) + nodes_clear(*node_alloc_noretry); + + for (i = 0; i < h->max_huge_pages; ++i) { + if (hstate_is_gigantic(h)) { + if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) + break; + } else if (!alloc_pool_huge_page(h, + &node_states[N_MEMORY], + node_alloc_noretry)) + break; + cond_resched(); + } + if (i < h->max_huge_pages) { + char buf[32]; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: allocating %lu of page size %s failed. 
Only allocated %lu hugepages.\n", + h->max_huge_pages, buf, i); + h->max_huge_pages = i; + } + kfree(node_alloc_noretry); +} + +static void __init hugetlb_init_hstates(void) +{ + struct hstate *h, *h2; + + for_each_hstate(h) { + /* oversize hugepages were init'ed in early boot */ + if (!hstate_is_gigantic(h)) + hugetlb_hstate_alloc_pages(h); + + /* + * Set demote order for each hstate. Note that + * h->demote_order is initially 0. + * - We can not demote gigantic pages if runtime freeing + * is not supported, so skip this. + * - If CMA allocation is possible, we can not demote + * HUGETLB_PAGE_ORDER or smaller size pages. + */ + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + continue; + if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER) + continue; + for_each_hstate(h2) { + if (h2 == h) + continue; + if (h2->order < h->order && + h2->order > h->demote_order) + h->demote_order = h2->order; + } + } +} + +static void __init report_hugepages(void) +{ + struct hstate *h; + + for_each_hstate(h) { + char buf[32]; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", + buf, h->free_huge_pages); + pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", + hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); + } +} + +#ifdef CONFIG_HIGHMEM +static void try_to_free_low(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) +{ + int i; + LIST_HEAD(page_list); + + lockdep_assert_held(&hugetlb_lock); + if (hstate_is_gigantic(h)) + return; + + /* + * Collect pages to be freed on a list, and free after dropping lock + */ + for_each_node_mask(i, *nodes_allowed) { + struct page *page, *next; + struct list_head *freel = &h->hugepage_freelists[i]; + list_for_each_entry_safe(page, next, freel, lru) { + if (count >= h->nr_huge_pages) + goto out; + if (PageHighMem(page)) + continue; + remove_hugetlb_page(h, page, false); + list_add(&page->lru, &page_list); + } 
+ } + +out: + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock_irq(&hugetlb_lock); +} +#else +static inline void try_to_free_low(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) +{ +} +#endif + +/* + * Increment or decrement surplus_huge_pages. Keep node-specific counters + * balanced by operating on them in a round-robin fashion. + * Returns 1 if an adjustment was made. + */ +static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, + int delta) +{ + int nr_nodes, node; + + lockdep_assert_held(&hugetlb_lock); + VM_BUG_ON(delta != -1 && delta != 1); + + if (delta < 0) { + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node]) + goto found; + } + } else { + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node] < + h->nr_huge_pages_node[node]) + goto found; + } + } + return 0; + +found: + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[node] += delta; + return 1; +} + +#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) +static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, + nodemask_t *nodes_allowed) +{ + unsigned long min_count, ret; + struct page *page; + LIST_HEAD(page_list); + NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); + + /* + * Bit mask controlling how hard we retry per-node allocations. + * If we can not allocate the bit mask, do not attempt to allocate + * the requested huge pages. + */ + if (node_alloc_noretry) + nodes_clear(*node_alloc_noretry); + else + return -ENOMEM; + + /* + * resize_lock mutex prevents concurrent adjustments to number of + * pages in hstate via the proc/sysfs interfaces. + */ + mutex_lock(&h->resize_lock); + flush_free_hpage_work(h); + spin_lock_irq(&hugetlb_lock); + + /* + * Check for a node specific request. 
+ * Changing node specific huge page count may require a corresponding + * change to the global count. In any case, the passed node mask + * (nodes_allowed) will restrict alloc/free to the specified node. + */ + if (nid != NUMA_NO_NODE) { + unsigned long old_count = count; + + count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; + /* + * User may have specified a large count value which caused the + * above calculation to overflow. In this case, they wanted + * to allocate as many huge pages as possible. Set count to + * largest possible value to align with their intention. + */ + if (count < old_count) + count = ULONG_MAX; + } + + /* + * Gigantic pages runtime allocation depend on the capability for large + * page range allocation. + * If the system does not provide this feature, return an error when + * the user tries to allocate gigantic pages but let the user free the + * boottime allocated gigantic pages. + */ + if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { + if (count > persistent_huge_pages(h)) { + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); + NODEMASK_FREE(node_alloc_noretry); + return -EINVAL; + } + /* Fall through to decrease pool */ + } + + /* + * Increase the pool size + * First take pages out of surplus state. Then make up the + * remaining difference by allocating fresh huge pages. + * + * We might race with alloc_surplus_huge_page() here and be unable + * to convert a surplus huge page to a normal huge page. That is + * not critical, though, it just means the overall size of the + * pool might be one hugepage larger than it needs to be, but + * within all the constraints specified by the sysctls. 
+ */ + while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, nodes_allowed, -1)) + break; + } + + while (count > persistent_huge_pages(h)) { + /* + * If this allocation races such that we no longer need the + * page, free_huge_page will handle it by freeing the page + * and reducing the surplus. + */ + spin_unlock_irq(&hugetlb_lock); + + /* yield cpu to avoid soft lockup */ + cond_resched(); + + ret = alloc_pool_huge_page(h, nodes_allowed, + node_alloc_noretry); + spin_lock_irq(&hugetlb_lock); + if (!ret) + goto out; + + /* Bail for signals. Probably ctrl-c from user */ + if (signal_pending(current)) + goto out; + } + + /* + * Decrease the pool size + * First return free pages to the buddy allocator (being careful + * to keep enough around to satisfy reservations). Then place + * pages into surplus state as needed so the pool will shrink + * to the desired size as pages become free. + * + * By placing pages into the surplus state independent of the + * overcommit value, we are allowing the surplus pool size to + * exceed overcommit. There are few sane options here. Since + * alloc_surplus_huge_page() is checking the global counter, + * though, we'll note that we're not allowed to exceed surplus + * and won't grow the pool anywhere else. Not until one of the + * sysctls are changed, or the surplus pages go out of use. 
+ */ + min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; + min_count = max(count, min_count); + try_to_free_low(h, min_count, nodes_allowed); + + /* + * Collect pages to be removed on list without dropping lock + */ + while (min_count < persistent_huge_pages(h)) { + page = remove_pool_huge_page(h, nodes_allowed, 0); + if (!page) + break; + + list_add(&page->lru, &page_list); + } + /* free the pages after dropping lock */ + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + flush_free_hpage_work(h); + spin_lock_irq(&hugetlb_lock); + + while (count < persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, nodes_allowed, 1)) + break; + } +out: + h->max_huge_pages = persistent_huge_pages(h); + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); + + NODEMASK_FREE(node_alloc_noretry); + + return 0; +} + +static int demote_free_huge_page(struct hstate *h, struct page *page) +{ + int i, nid = page_to_nid(page); + struct hstate *target_hstate; + struct page *subpage; + int rc = 0; + + target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); + + remove_hugetlb_page_for_demote(h, page, false); + spin_unlock_irq(&hugetlb_lock); + + rc = hugetlb_vmemmap_restore(h, page); + if (rc) { + /* Allocation of vmemmmap failed, we can not demote page */ + spin_lock_irq(&hugetlb_lock); + set_page_refcounted(page); + add_hugetlb_page(h, page, false); + return rc; + } + + /* + * Use destroy_compound_hugetlb_page_for_demote for all huge page + * sizes as it will not ref count pages. + */ + destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h)); + + /* + * Taking target hstate mutex synchronizes with set_max_huge_pages. + * Without the mutex, pages added to target hstate could be marked + * as surplus. + * + * Note that we already hold h->resize_lock. To prevent deadlock, + * use the convention of always taking larger size hstate mutex first. 
+ */ + mutex_lock(&target_hstate->resize_lock); + for (i = 0; i < pages_per_huge_page(h); + i += pages_per_huge_page(target_hstate)) { + subpage = nth_page(page, i); + if (hstate_is_gigantic(target_hstate)) + prep_compound_gigantic_page_for_demote(subpage, + target_hstate->order); + else + prep_compound_page(subpage, target_hstate->order); + set_page_private(subpage, 0); + prep_new_huge_page(target_hstate, subpage, nid); + free_huge_page(subpage); + } + mutex_unlock(&target_hstate->resize_lock); + + spin_lock_irq(&hugetlb_lock); + + /* + * Not absolutely necessary, but for consistency update max_huge_pages + * based on pool changes for the demoted page. + */ + h->max_huge_pages--; + target_hstate->max_huge_pages += + pages_per_huge_page(h) / pages_per_huge_page(target_hstate); + + return rc; +} + +static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) + __must_hold(&hugetlb_lock) +{ + int nr_nodes, node; + struct page *page; + + lockdep_assert_held(&hugetlb_lock); + + /* We should never get here if no demote order */ + if (!h->demote_order) { + pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); + return -EINVAL; /* internal error */ + } + + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { + list_for_each_entry(page, &h->hugepage_freelists[node], lru) { + if (PageHWPoison(page)) + continue; + + return demote_free_huge_page(h, page); + } + } + + /* + * Only way to get here is if all pages on free lists are poisoned. + * Return -EBUSY so that caller will not retry. 
+ */ + return -EBUSY; +} + +#define HSTATE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR_WO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_WO(_name) + +#define HSTATE_ATTR(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RW(_name) + +static struct kobject *hugepages_kobj; +static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; + +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); + +static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) +{ + int i; + + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = NUMA_NO_NODE; + return &hstates[i]; + } + + return kobj_to_node_hstate(kobj, nidp); +} + +static ssize_t nr_hugepages_show_common(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h; + unsigned long nr_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + nr_huge_pages = h->nr_huge_pages; + else + nr_huge_pages = h->nr_huge_pages_node[nid]; + + return sysfs_emit(buf, "%lu\n", nr_huge_pages); +} + +static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, + struct hstate *h, int nid, + unsigned long count, size_t len) +{ + int err; + nodemask_t nodes_allowed, *n_mask; + + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + return -EINVAL; + + if (nid == NUMA_NO_NODE) { + /* + * global hstate attribute + */ + if (!(obey_mempolicy && + init_nodemask_of_mempolicy(&nodes_allowed))) + n_mask = &node_states[N_MEMORY]; + else + n_mask = &nodes_allowed; + } else { + /* + * Node specific request. count adjustment happens in + * set_max_huge_pages() after acquiring hugetlb_lock. + */ + init_nodemask_of_node(&nodes_allowed, nid); + n_mask = &nodes_allowed; + } + + err = set_max_huge_pages(h, count, nid, n_mask); + + return err ? 
err : len; +} + +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, const char *buf, + size_t len) +{ + struct hstate *h; + unsigned long count; + int nid; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + + h = kobj_to_hstate(kobj, &nid); + return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(false, kobj, buf, len); +} +HSTATE_ATTR(nr_hugepages); + +#ifdef CONFIG_NUMA + +/* + * hstate attribute for optionally mempolicy-based constraint on persistent + * huge page alloc/free. + */ +static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(true, kobj, buf, len); +} +HSTATE_ATTR(nr_hugepages_mempolicy); +#endif + + +static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj, NULL); + return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); +} + +static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj, NULL); + + if (hstate_is_gigantic(h)) + return -EINVAL; + + err = kstrtoul(buf, 10, &input); + if (err) + return err; + + spin_lock_irq(&hugetlb_lock); + h->nr_overcommit_huge_pages = input; + spin_unlock_irq(&hugetlb_lock); + + return count; +} 
+HSTATE_ATTR(nr_overcommit_hugepages); + +static ssize_t free_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h; + unsigned long free_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + free_huge_pages = h->free_huge_pages; + else + free_huge_pages = h->free_huge_pages_node[nid]; + + return sysfs_emit(buf, "%lu\n", free_huge_pages); +} +HSTATE_ATTR_RO(free_hugepages); + +static ssize_t resv_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj, NULL); + return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); +} +HSTATE_ATTR_RO(resv_hugepages); + +static ssize_t surplus_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h; + unsigned long surplus_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + surplus_huge_pages = h->surplus_huge_pages; + else + surplus_huge_pages = h->surplus_huge_pages_node[nid]; + + return sysfs_emit(buf, "%lu\n", surplus_huge_pages); +} +HSTATE_ATTR_RO(surplus_hugepages); + +static ssize_t demote_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + unsigned long nr_demote; + unsigned long nr_available; + nodemask_t nodes_allowed, *n_mask; + struct hstate *h; + int err; + int nid; + + err = kstrtoul(buf, 10, &nr_demote); + if (err) + return err; + h = kobj_to_hstate(kobj, &nid); + + if (nid != NUMA_NO_NODE) { + init_nodemask_of_node(&nodes_allowed, nid); + n_mask = &nodes_allowed; + } else { + n_mask = &node_states[N_MEMORY]; + } + + /* Synchronize with other sysfs operations modifying huge pages */ + mutex_lock(&h->resize_lock); + spin_lock_irq(&hugetlb_lock); + + while (nr_demote) { + /* + * Check for available pages to demote each time thorough the + * loop as demote_pool_huge_page will drop hugetlb_lock. 
+ */ + if (nid != NUMA_NO_NODE) + nr_available = h->free_huge_pages_node[nid]; + else + nr_available = h->free_huge_pages; + nr_available -= h->resv_huge_pages; + if (!nr_available) + break; + + err = demote_pool_huge_page(h, n_mask); + if (err) + break; + + nr_demote--; + } + + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); + + if (err) + return err; + return len; +} +HSTATE_ATTR_WO(demote); + +static ssize_t demote_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj, NULL); + unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K; + + return sysfs_emit(buf, "%lukB\n", demote_size); +} + +static ssize_t demote_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct hstate *h, *demote_hstate; + unsigned long demote_size; + unsigned int demote_order; + + demote_size = (unsigned long)memparse(buf, NULL); + + demote_hstate = size_to_hstate(demote_size); + if (!demote_hstate) + return -EINVAL; + demote_order = demote_hstate->order; + if (demote_order < HUGETLB_PAGE_ORDER) + return -EINVAL; + + /* demote order must be smaller than hstate order */ + h = kobj_to_hstate(kobj, NULL); + if (demote_order >= h->order) + return -EINVAL; + + /* resize_lock synchronizes access to demote size and writes */ + mutex_lock(&h->resize_lock); + h->demote_order = demote_order; + mutex_unlock(&h->resize_lock); + + return count; +} +HSTATE_ATTR(demote_size); + +static struct attribute *hstate_attrs[] = { + &nr_hugepages_attr.attr, + &nr_overcommit_hugepages_attr.attr, + &free_hugepages_attr.attr, + &resv_hugepages_attr.attr, + &surplus_hugepages_attr.attr, +#ifdef CONFIG_NUMA + &nr_hugepages_mempolicy_attr.attr, +#endif + NULL, +}; + +static const struct attribute_group hstate_attr_group = { + .attrs = hstate_attrs, +}; + +static struct attribute *hstate_demote_attrs[] = { + &demote_size_attr.attr, + &demote_attr.attr, + NULL, +}; + +static 
const struct attribute_group hstate_demote_attr_group = { + .attrs = hstate_demote_attrs, +}; + +static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, + struct kobject **hstate_kobjs, + const struct attribute_group *hstate_attr_group) +{ + int retval; + int hi = hstate_index(h); + + hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); + if (!hstate_kobjs[hi]) + return -ENOMEM; + + retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); + if (retval) { + kobject_put(hstate_kobjs[hi]); + hstate_kobjs[hi] = NULL; + return retval; + } + + if (h->demote_order) { + retval = sysfs_create_group(hstate_kobjs[hi], + &hstate_demote_attr_group); + if (retval) { + pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name); + sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group); + kobject_put(hstate_kobjs[hi]); + hstate_kobjs[hi] = NULL; + return retval; + } + } + + return 0; +} + +#ifdef CONFIG_NUMA +static bool hugetlb_sysfs_initialized __ro_after_init; + +/* + * node_hstate/s - associate per node hstate attributes, via their kobjects, + * with node devices in node_devices[] using a parallel array. The array + * index of a node device or _hstate == node id. + * This is here to avoid any static dependency of the node device driver, in + * the base kernel, on the hugetlb module. + */ +struct node_hstate { + struct kobject *hugepages_kobj; + struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; +}; +static struct node_hstate node_hstates[MAX_NUMNODES]; + +/* + * A subset of global hstate attributes for node devices + */ +static struct attribute *per_node_hstate_attrs[] = { + &nr_hugepages_attr.attr, + &free_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static const struct attribute_group per_node_hstate_attr_group = { + .attrs = per_node_hstate_attrs, +}; + +/* + * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj. + * Returns node id via non-NULL nidp. 
+ */ +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + int nid; + + for (nid = 0; nid < nr_node_ids; nid++) { + struct node_hstate *nhs = &node_hstates[nid]; + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (nhs->hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = nid; + return &hstates[i]; + } + } + + BUG(); + return NULL; +} + +/* + * Unregister hstate attributes from a single node device. + * No-op if no hstate attributes attached. + */ +void hugetlb_unregister_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->dev.id]; + + if (!nhs->hugepages_kobj) + return; /* no hstate attributes */ + + for_each_hstate(h) { + int idx = hstate_index(h); + struct kobject *hstate_kobj = nhs->hstate_kobjs[idx]; + + if (!hstate_kobj) + continue; + if (h->demote_order) + sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group); + sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group); + kobject_put(hstate_kobj); + nhs->hstate_kobjs[idx] = NULL; + } + + kobject_put(nhs->hugepages_kobj); + nhs->hugepages_kobj = NULL; +} + + +/* + * Register hstate attributes for a single node device. + * No-op if attributes already registered. + */ +void hugetlb_register_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->dev.id]; + int err; + + if (!hugetlb_sysfs_initialized) + return; + + if (nhs->hugepages_kobj) + return; /* already allocated */ + + nhs->hugepages_kobj = kobject_create_and_add("hugepages", + &node->dev.kobj); + if (!nhs->hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, + nhs->hstate_kobjs, + &per_node_hstate_attr_group); + if (err) { + pr_err("HugeTLB: Unable to add hstate %s for node %d\n", + h->name, node->dev.id); + hugetlb_unregister_node(node); + break; + } + } +} + +/* + * hugetlb init time: register hstate attributes for all registered node + * devices of nodes that have memory. 
All on-line nodes should have + * registered their associated device by this time. + */ +static void __init hugetlb_register_all_nodes(void) +{ + int nid; + + for_each_online_node(nid) + hugetlb_register_node(node_devices[nid]); +} +#else /* !CONFIG_NUMA */ + +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + BUG(); + if (nidp) + *nidp = -1; + return NULL; +} + +static void hugetlb_register_all_nodes(void) { } + +#endif + +#ifdef CONFIG_CMA +static void __init hugetlb_cma_check(void); +#else +static inline __init void hugetlb_cma_check(void) +{ +} +#endif + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); + if (err) + pr_err("HugeTLB: Unable to add hstate %s", h->name); + } + +#ifdef CONFIG_NUMA + hugetlb_sysfs_initialized = true; +#endif + hugetlb_register_all_nodes(); +} + +static int __init hugetlb_init(void) +{ + int i; + + BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < + __NR_HPAGEFLAGS); + + if (!hugepages_supported()) { + if (hugetlb_max_hstate || default_hstate_max_huge_pages) + pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); + return 0; + } + + /* + * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some + * architectures depend on setup being done here. + */ + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + if (!parsed_default_hugepagesz) { + /* + * If we did not parse a default huge page size, set + * default_hstate_idx to HPAGE_SIZE hstate. And, if the + * number of huge pages for this default size was implicitly + * specified, set that here as well. + * Note that the implicit setting will overwrite an explicit + * setting. A warning will be printed in this case. 
+ */ + default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE)); + if (default_hstate_max_huge_pages) { + if (default_hstate.max_huge_pages) { + char buf[32]; + + string_get_size(huge_page_size(&default_hstate), + 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", + default_hstate.max_huge_pages, buf); + pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", + default_hstate_max_huge_pages); + } + default_hstate.max_huge_pages = + default_hstate_max_huge_pages; + + for_each_online_node(i) + default_hstate.max_huge_pages_node[i] = + default_hugepages_in_node[i]; + } + } + + hugetlb_cma_check(); + hugetlb_init_hstates(); + gather_bootmem_prealloc(); + report_hugepages(); + + hugetlb_sysfs_init(); + hugetlb_cgroup_file_init(); + +#ifdef CONFIG_SMP + num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); +#else + num_fault_mutexes = 1; +#endif + hugetlb_fault_mutex_table = + kmalloc_array(num_fault_mutexes, sizeof(struct mutex), + GFP_KERNEL); + BUG_ON(!hugetlb_fault_mutex_table); + + for (i = 0; i < num_fault_mutexes; i++) + mutex_init(&hugetlb_fault_mutex_table[i]); + return 0; +} +subsys_initcall(hugetlb_init); + +/* Overwritten by architectures with more huge page sizes */ +bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size) +{ + return size == HPAGE_SIZE; +} + +void __init hugetlb_add_hstate(unsigned int order) +{ + struct hstate *h; + unsigned long i; + + if (size_to_hstate(PAGE_SIZE << order)) { + return; + } + BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order == 0); + h = &hstates[hugetlb_max_hstate++]; + mutex_init(&h->resize_lock); + h->order = order; + h->mask = ~(huge_page_size(h) - 1); + for (i = 0; i < MAX_NUMNODES; ++i) + INIT_LIST_HEAD(&h->hugepage_freelists[i]); + INIT_LIST_HEAD(&h->hugepage_activelist); + h->next_nid_to_alloc = first_memory_node; + h->next_nid_to_free = first_memory_node; + snprintf(h->name, HSTATE_NAME_LEN, 
"hugepages-%lukB", + huge_page_size(h)/SZ_1K); + + parsed_hstate = h; +} + +bool __init __weak hugetlb_node_alloc_supported(void) +{ + return true; +} + +static void __init hugepages_clear_pages_in_node(void) +{ + if (!hugetlb_max_hstate) { + default_hstate_max_huge_pages = 0; + memset(default_hugepages_in_node, 0, + sizeof(default_hugepages_in_node)); + } else { + parsed_hstate->max_huge_pages = 0; + memset(parsed_hstate->max_huge_pages_node, 0, + sizeof(parsed_hstate->max_huge_pages_node)); + } +} + +/* + * hugepages command line processing + * hugepages normally follows a valid hugepagsz or default_hugepagsz + * specification. If not, ignore the hugepages value. hugepages can also + * be the first huge page command line option in which case it implicitly + * specifies the number of huge pages for the default size. + */ +static int __init hugepages_setup(char *s) +{ + unsigned long *mhp; + static unsigned long *last_mhp; + int node = NUMA_NO_NODE; + int count; + unsigned long tmp; + char *p = s; + + if (!parsed_valid_hugepagesz) { + pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); + parsed_valid_hugepagesz = true; + return 1; + } + + /* + * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter + * yet, so this hugepages= parameter goes to the "default hstate". + * Otherwise, it goes with the previously parsed hugepagesz or + * default_hugepagesz. 
+ */ + else if (!hugetlb_max_hstate) + mhp = &default_hstate_max_huge_pages; + else + mhp = &parsed_hstate->max_huge_pages; + + if (mhp == last_mhp) { + pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s); + return 1; + } + + while (*p) { + count = 0; + if (sscanf(p, "%lu%n", &tmp, &count) != 1) + goto invalid; + /* Parameter is node format */ + if (p[count] == ':') { + if (!hugetlb_node_alloc_supported()) { + pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); + return 1; + } + if (tmp >= MAX_NUMNODES || !node_online(tmp)) + goto invalid; + node = array_index_nospec(tmp, MAX_NUMNODES); + p += count + 1; + /* Parse hugepages */ + if (sscanf(p, "%lu%n", &tmp, &count) != 1) + goto invalid; + if (!hugetlb_max_hstate) + default_hugepages_in_node[node] = tmp; + else + parsed_hstate->max_huge_pages_node[node] = tmp; + *mhp += tmp; + /* Go to parse next node*/ + if (p[count] == ',') + p += count + 1; + else + break; + } else { + if (p != s) + goto invalid; + *mhp = tmp; + break; + } + } + + /* + * Global state is always initialized later in hugetlb_init. + * But we need to allocate gigantic hstates here early to still + * use the bootmem allocator. + */ + if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) + hugetlb_hstate_alloc_pages(parsed_hstate); + + last_mhp = mhp; + + return 1; + +invalid: + pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); + hugepages_clear_pages_in_node(); + return 1; +} +__setup("hugepages=", hugepages_setup); + +/* + * hugepagesz command line processing + * A specific huge page size can only be specified once with hugepagesz. + * hugepagesz is followed by hugepages on the command line. The global + * variable 'parsed_valid_hugepagesz' is used to determine if prior + * hugepagesz argument was valid. 
+ */ +static int __init hugepagesz_setup(char *s) +{ + unsigned long size; + struct hstate *h; + + parsed_valid_hugepagesz = false; + size = (unsigned long)memparse(s, NULL); + + if (!arch_hugetlb_valid_size(size)) { + pr_err("HugeTLB: unsupported hugepagesz=%s\n", s); + return 1; + } + + h = size_to_hstate(size); + if (h) { + /* + * hstate for this size already exists. This is normally + * an error, but is allowed if the existing hstate is the + * default hstate. More specifically, it is only allowed if + * the number of huge pages for the default hstate was not + * previously specified. + */ + if (!parsed_default_hugepagesz || h != &default_hstate || + default_hstate.max_huge_pages) { + pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s); + return 1; + } + + /* + * No need to call hugetlb_add_hstate() as hstate already + * exists. But, do set parsed_hstate so that a following + * hugepages= parameter will be applied to this hstate. + */ + parsed_hstate = h; + parsed_valid_hugepagesz = true; + return 1; + } + + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); + parsed_valid_hugepagesz = true; + return 1; +} +__setup("hugepagesz=", hugepagesz_setup); + +/* + * default_hugepagesz command line input + * Only one instance of default_hugepagesz allowed on command line. 
+ */ +static int __init default_hugepagesz_setup(char *s) +{ + unsigned long size; + int i; + + parsed_valid_hugepagesz = false; + if (parsed_default_hugepagesz) { + pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); + return 1; + } + + size = (unsigned long)memparse(s, NULL); + + if (!arch_hugetlb_valid_size(size)) { + pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s); + return 1; + } + + hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT); + parsed_valid_hugepagesz = true; + parsed_default_hugepagesz = true; + default_hstate_idx = hstate_index(size_to_hstate(size)); + + /* + * The number of default huge pages (for this size) could have been + * specified as the first hugetlb parameter: hugepages=X. If so, + * then default_hstate_max_huge_pages is set. If the default huge + * page size is gigantic (>= MAX_ORDER), then the pages must be + * allocated here from bootmem allocator. + */ + if (default_hstate_max_huge_pages) { + default_hstate.max_huge_pages = default_hstate_max_huge_pages; + for_each_online_node(i) + default_hstate.max_huge_pages_node[i] = + default_hugepages_in_node[i]; + if (hstate_is_gigantic(&default_hstate)) + hugetlb_hstate_alloc_pages(&default_hstate); + default_hstate_max_huge_pages = 0; + } + + return 1; +} +__setup("default_hugepagesz=", default_hugepagesz_setup); + +static nodemask_t *policy_mbind_nodemask(gfp_t gfp) +{ +#ifdef CONFIG_NUMA + struct mempolicy *mpol = get_task_policy(current); + + /* + * Only enforce MPOL_BIND policy which overlaps with cpuset policy + * (from policy_nodemask) specifically for hugetlb case + */ + if (mpol->mode == MPOL_BIND && + (apply_policy_zone(mpol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) + return &mpol->nodes; +#endif + return NULL; +} + +static unsigned int allowed_mems_nr(struct hstate *h) +{ + int node; + unsigned int nr = 0; + nodemask_t *mbind_nodemask; + unsigned int *array = h->free_huge_pages_node; + gfp_t gfp_mask = htlb_alloc_mask(h); + 
+ mbind_nodemask = policy_mbind_nodemask(gfp_mask); + for_each_node_mask(node, cpuset_current_mems_allowed) { + if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) + nr += array[node]; + } + + return nr; +} + +#ifdef CONFIG_SYSCTL +static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos, unsigned long *out) +{ + struct ctl_table dup_table; + + /* + * In order to avoid races with __do_proc_doulongvec_minmax(), we + * can duplicate the @table and alter the duplicate of it. + */ + dup_table = *table; + dup_table.data = out; + + return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); +} + +static int hugetlb_sysctl_handler_common(bool obey_mempolicy, + struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + struct hstate *h = &default_hstate; + unsigned long tmp = h->max_huge_pages; + int ret; + + if (!hugepages_supported()) + return -EOPNOTSUPP; + + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); + if (ret) + goto out; + + if (write) + ret = __nr_hugepages_store_common(obey_mempolicy, h, + NUMA_NO_NODE, tmp, *length); +out: + return ret; +} + +int hugetlb_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + + return hugetlb_sysctl_handler_common(false, table, write, + buffer, length, ppos); +} + +#ifdef CONFIG_NUMA +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + return hugetlb_sysctl_handler_common(true, table, write, + buffer, length, ppos); +} +#endif /* CONFIG_NUMA */ + +int hugetlb_overcommit_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + struct hstate *h = &default_hstate; + unsigned long tmp; + int ret; + + if (!hugepages_supported()) + return -EOPNOTSUPP; + + tmp = h->nr_overcommit_huge_pages; + + if (write && hstate_is_gigantic(h)) + 
return -EINVAL; + + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); + if (ret) + goto out; + + if (write) { + spin_lock_irq(&hugetlb_lock); + h->nr_overcommit_huge_pages = tmp; + spin_unlock_irq(&hugetlb_lock); + } +out: + return ret; +} + +#endif /* CONFIG_SYSCTL */ + +void hugetlb_report_meminfo(struct seq_file *m) +{ + struct hstate *h; + unsigned long total = 0; + + if (!hugepages_supported()) + return; + + for_each_hstate(h) { + unsigned long count = h->nr_huge_pages; + + total += huge_page_size(h) * count; + + if (h == &default_hstate) + seq_printf(m, + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" + "Hugepagesize: %8lu kB\n", + count, + h->free_huge_pages, + h->resv_huge_pages, + h->surplus_huge_pages, + huge_page_size(h) / SZ_1K); + } + + seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K); +} + +int hugetlb_report_node_meminfo(char *buf, int len, int nid) +{ + struct hstate *h = &default_hstate; + + if (!hugepages_supported()) + return 0; + + return sysfs_emit_at(buf, len, + "Node %d HugePages_Total: %5u\n" + "Node %d HugePages_Free: %5u\n" + "Node %d HugePages_Surp: %5u\n", + nid, h->nr_huge_pages_node[nid], + nid, h->free_huge_pages_node[nid], + nid, h->surplus_huge_pages_node[nid]); +} + +void hugetlb_show_meminfo_node(int nid) +{ + struct hstate *h; + + if (!hugepages_supported()) + return; + + for_each_hstate(h) + printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", + nid, + h->nr_huge_pages_node[nid], + h->free_huge_pages_node[nid], + h->surplus_huge_pages_node[nid], + huge_page_size(h) / SZ_1K); +} + +void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "HugetlbPages:\t%8lu kB\n", + atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); +} + +/* Return the number of pages of memory we physically have, in PAGE_SIZE units. 
*/ +unsigned long hugetlb_total_pages(void) +{ + struct hstate *h; + unsigned long nr_total_pages = 0; + + for_each_hstate(h) + nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); + return nr_total_pages; +} + +static int hugetlb_acct_memory(struct hstate *h, long delta) +{ + int ret = -ENOMEM; + + if (!delta) + return 0; + + spin_lock_irq(&hugetlb_lock); + /* + * When cpuset is configured, it breaks the strict hugetlb page + * reservation as the accounting is done on a global variable. Such + * reservation is completely rubbish in the presence of cpuset because + * the reservation is not checked against page availability for the + * current cpuset. Application can still potentially be OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. + * Attempting to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is so fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + * + * Apart from cpuset, we also have memory policy mechanism that + * also determines from which node the kernel will allocate memory + * in a NUMA system. So similar to cpuset, we also should consider + * the memory policy of the current task. Similar to the description + * above. 
+ */ + if (delta > 0) { + if (gather_surplus_pages(h, delta) < 0) + goto out; + + if (delta > allowed_mems_nr(h)) { + return_unused_surplus_pages(h, delta); + goto out; + } + } + + ret = 0; + if (delta < 0) + return_unused_surplus_pages(h, (unsigned long) -delta); + +out: + spin_unlock_irq(&hugetlb_lock); + return ret; +} + +static void hugetlb_vm_op_open(struct vm_area_struct *vma) +{ + struct resv_map *resv = vma_resv_map(vma); + + /* + * HPAGE_RESV_OWNER indicates a private mapping. + * This new VMA should share its siblings reservation map if present. + * The VMA will only ever have a valid reservation map pointer where + * it is being copied for another still existing VMA. As that VMA + * has a reference to the reservation map it cannot disappear until + * after this open call completes. It is therefore safe to take a + * new reference here without additional locking. + */ + if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + resv_map_dup_hugetlb_cgroup_uncharge_info(resv); + kref_get(&resv->refs); + } + + /* + * vma_lock structure for sharable mappings is vma specific. + * Clear old pointer (if copied via vm_area_dup) and allocate + * new structure. Before clearing, make sure vma_lock is not + * for this vma. 
+ */ + if (vma->vm_flags & VM_MAYSHARE) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (vma_lock) { + if (vma_lock->vma != vma) { + vma->vm_private_data = NULL; + hugetlb_vma_lock_alloc(vma); + } else + pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); + } else + hugetlb_vma_lock_alloc(vma); + } +} + +static void hugetlb_vm_op_close(struct vm_area_struct *vma) +{ + struct hstate *h = hstate_vma(vma); + struct resv_map *resv; + struct hugepage_subpool *spool = subpool_vma(vma); + unsigned long reserve, start, end; + long gbl_reserve; + + hugetlb_vma_lock_free(vma); + + resv = vma_resv_map(vma); + if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return; + + start = vma_hugecache_offset(h, vma, vma->vm_start); + end = vma_hugecache_offset(h, vma, vma->vm_end); + + reserve = (end - start) - region_count(resv, start, end); + hugetlb_cgroup_uncharge_counter(resv, start, end); + if (reserve) { + /* + * Decrement reserve counts. The global reserve count may be + * adjusted if the subpool has a minimum size. + */ + gbl_reserve = hugepage_subpool_put_pages(spool, reserve); + hugetlb_acct_memory(h, -gbl_reserve); + } + + kref_put(&resv->refs, resv_map_release); +} + +static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) +{ + if (addr & ~(huge_page_mask(hstate_vma(vma)))) + return -EINVAL; + + /* + * PMD sharing is only possible for PUD_SIZE-aligned address ranges + * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this + * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. + */ + if (addr & ~PUD_MASK) { + /* + * hugetlb_vm_op_split is called right before we attempt to + * split the VMA. We will need to unshare PMDs in the old and + * new VMAs, so let's unshare before we split. 
+ */ + unsigned long floor = addr & PUD_MASK; + unsigned long ceil = floor + PUD_SIZE; + + if (floor >= vma->vm_start && ceil <= vma->vm_end) + hugetlb_unshare_pmds(vma, floor, ceil); + } + + return 0; +} + +static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) +{ + return huge_page_size(hstate_vma(vma)); +} + +/* + * We cannot handle pagefaults against hugetlb pages at all. They cause + * handle_mm_fault() to try to instantiate regular-sized pages in the + * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get + * this far. + */ +static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) +{ + BUG(); + return 0; +} + +/* + * When a new function is introduced to vm_operations_struct and added + * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. + * This is because under System V memory model, mappings created via + * shmget/shmat with "huge page" specified are backed by hugetlbfs files, + * but their original vm_ops are overwritten with shm_vm_ops. 
+ */ +const struct vm_operations_struct hugetlb_vm_ops = { + .fault = hugetlb_vm_op_fault, + .open = hugetlb_vm_op_open, + .close = hugetlb_vm_op_close, + .may_split = hugetlb_vm_op_split, + .pagesize = hugetlb_vm_op_pagesize, +}; + +static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, + int writable) +{ + pte_t entry; + unsigned int shift = huge_page_shift(hstate_vma(vma)); + + if (writable) { + entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, + vma->vm_page_prot))); + } else { + entry = huge_pte_wrprotect(mk_huge_pte(page, + vma->vm_page_prot)); + } + entry = pte_mkyoung(entry); + entry = arch_make_huge_pte(entry, shift, vma->vm_flags); + + return entry; +} + +static void set_huge_ptep_writable(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + pte_t entry; + + entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); + if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) + update_mmu_cache(vma, address, ptep); +} + +bool is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return false; + swp = pte_to_swp_entry(pte); + if (is_migration_entry(swp)) + return true; + else + return false; +} + +static bool is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return false; + swp = pte_to_swp_entry(pte); + if (is_hwpoison_entry(swp)) + return true; + else + return false; +} + +static void +hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, + struct page *new_page) +{ + __SetPageUptodate(new_page); + hugepage_add_new_anon_rmap(new_page, vma, addr); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); + hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); + ClearHPageRestoreReserve(new_page); + SetHPageMigratable(new_page); +} + +int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, + struct 
vm_area_struct *dst_vma, + struct vm_area_struct *src_vma) +{ + pte_t *src_pte, *dst_pte, entry; + struct page *ptepage; + unsigned long addr; + bool cow = is_cow_mapping(src_vma->vm_flags); + struct hstate *h = hstate_vma(src_vma); + unsigned long sz = huge_page_size(h); + unsigned long npages = pages_per_huge_page(h); + struct mmu_notifier_range range; + unsigned long last_addr_mask; + int ret = 0; + + if (cow) { + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src, + src_vma->vm_start, + src_vma->vm_end); + mmu_notifier_invalidate_range_start(&range); + mmap_assert_write_locked(src); + raw_write_seqcount_begin(&src->write_protect_seq); + } else { + /* + * For shared mappings the vma lock must be held before + * calling huge_pte_offset in the src vma. Otherwise, the + * returned ptep could go away if part of a shared pmd and + * another thread calls huge_pmd_unshare. + */ + hugetlb_vma_lock_read(src_vma); + } + + last_addr_mask = hugetlb_mask_last_page(h); + for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { + spinlock_t *src_ptl, *dst_ptl; + src_pte = huge_pte_offset(src, addr, sz); + if (!src_pte) { + addr |= last_addr_mask; + continue; + } + dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); + if (!dst_pte) { + ret = -ENOMEM; + break; + } + + /* + * If the pagetables are shared don't copy or take references. + * + * dst_pte == src_pte is the common case of src/dest sharing. + * However, src could have 'unshared' and dst shares with + * another vma. So page_count of ptep page is checked instead + * to reliably determine whether pte is shared. + */ + if (page_count(virt_to_page(dst_pte)) > 1) { + addr |= last_addr_mask; + continue; + } + + dst_ptl = huge_pte_lock(h, dst, dst_pte); + src_ptl = huge_pte_lockptr(h, src, src_pte); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + entry = huge_ptep_get(src_pte); +again: + if (huge_pte_none(entry)) { + /* + * Skip if src entry none. 
+ */ + ; + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { + bool uffd_wp = huge_pte_uffd_wp(entry); + + if (!userfaultfd_wp(dst_vma) && uffd_wp) + entry = huge_pte_clear_uffd_wp(entry); + set_huge_pte_at(dst, addr, dst_pte, entry); + } else if (unlikely(is_hugetlb_entry_migration(entry))) { + swp_entry_t swp_entry = pte_to_swp_entry(entry); + bool uffd_wp = huge_pte_uffd_wp(entry); + + if (!is_readable_migration_entry(swp_entry) && cow) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + swp_entry = make_readable_migration_entry( + swp_offset(swp_entry)); + entry = swp_entry_to_pte(swp_entry); + if (userfaultfd_wp(src_vma) && uffd_wp) + entry = huge_pte_mkuffd_wp(entry); + set_huge_pte_at(src, addr, src_pte, entry); + } + if (!userfaultfd_wp(dst_vma) && uffd_wp) + entry = huge_pte_clear_uffd_wp(entry); + set_huge_pte_at(dst, addr, dst_pte, entry); + } else if (unlikely(is_pte_marker(entry))) { + /* + * We copy the pte marker only if the dst vma has + * uffd-wp enabled. + */ + if (userfaultfd_wp(dst_vma)) + set_huge_pte_at(dst, addr, dst_pte, entry); + } else { + entry = huge_ptep_get(src_pte); + ptepage = pte_page(entry); + get_page(ptepage); + + /* + * Failing to duplicate the anon rmap is a rare case + * where we see pinned hugetlb pages while they're + * prone to COW. We need to do the COW earlier during + * fork. + * + * When pre-allocating the page or copying data, we + * need to be without the pgtable locks since we could + * sleep during the process. 
+ */ + if (!PageAnon(ptepage)) { + page_dup_file_rmap(ptepage, true); + } else if (page_try_dup_anon_rmap(ptepage, true, + src_vma)) { + pte_t src_pte_old = entry; + struct page *new; + + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + /* Do not use reserve as it's private owned */ + new = alloc_huge_page(dst_vma, addr, 1); + if (IS_ERR(new)) { + put_page(ptepage); + ret = PTR_ERR(new); + break; + } + copy_user_huge_page(new, ptepage, addr, dst_vma, + npages); + put_page(ptepage); + + /* Install the new huge page if src pte stable */ + dst_ptl = huge_pte_lock(h, dst, dst_pte); + src_ptl = huge_pte_lockptr(h, src, src_pte); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + entry = huge_ptep_get(src_pte); + if (!pte_same(src_pte_old, entry)) { + restore_reserve_on_error(h, dst_vma, addr, + new); + put_page(new); + /* huge_ptep of dst_pte won't change as in child */ + goto again; + } + hugetlb_install_page(dst_vma, dst_pte, addr, new); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + continue; + } + + if (cow) { + /* + * No need to notify as we are downgrading page + * table protection not changing it to point + * to a new page. 
+ * + * See Documentation/mm/mmu_notifier.rst + */ + huge_ptep_set_wrprotect(src, addr, src_pte); + entry = huge_pte_wrprotect(entry); + } + + set_huge_pte_at(dst, addr, dst_pte, entry); + hugetlb_count_add(npages, dst); + } + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + } + + if (cow) { + raw_write_seqcount_end(&src->write_protect_seq); + mmu_notifier_invalidate_range_end(&range); + } else { + hugetlb_vma_unlock_read(src_vma); + } + + return ret; +} + +static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte) +{ + struct hstate *h = hstate_vma(vma); + struct mm_struct *mm = vma->vm_mm; + spinlock_t *src_ptl, *dst_ptl; + pte_t pte; + + dst_ptl = huge_pte_lock(h, mm, dst_pte); + src_ptl = huge_pte_lockptr(h, mm, src_pte); + + /* + * We don't have to worry about the ordering of src and dst ptlocks + * because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock. + */ + if (src_ptl != dst_ptl) + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + + pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); + set_huge_pte_at(mm, new_addr, dst_pte, pte); + + if (src_ptl != dst_ptl) + spin_unlock(src_ptl); + spin_unlock(dst_ptl); +} + +int move_hugetlb_page_tables(struct vm_area_struct *vma, + struct vm_area_struct *new_vma, + unsigned long old_addr, unsigned long new_addr, + unsigned long len) +{ + struct hstate *h = hstate_vma(vma); + struct address_space *mapping = vma->vm_file->f_mapping; + unsigned long sz = huge_page_size(h); + struct mm_struct *mm = vma->vm_mm; + unsigned long old_end = old_addr + len; + unsigned long last_addr_mask; + pte_t *src_pte, *dst_pte; + struct mmu_notifier_range range; + bool shared_pmd = false; + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr, + old_end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + /* + * In case of shared PMDs, we should cover the maximum possible + * range. 
+ */ + flush_cache_range(vma, range.start, range.end); + + mmu_notifier_invalidate_range_start(&range); + last_addr_mask = hugetlb_mask_last_page(h); + /* Prevent race with file truncation */ + hugetlb_vma_lock_write(vma); + i_mmap_lock_write(mapping); + for (; old_addr < old_end; old_addr += sz, new_addr += sz) { + src_pte = huge_pte_offset(mm, old_addr, sz); + if (!src_pte) { + old_addr |= last_addr_mask; + new_addr |= last_addr_mask; + continue; + } + if (huge_pte_none(huge_ptep_get(src_pte))) + continue; + + if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { + shared_pmd = true; + old_addr |= last_addr_mask; + new_addr |= last_addr_mask; + continue; + } + + dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz); + if (!dst_pte) + break; + + move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte); + } + + if (shared_pmd) + flush_tlb_range(vma, range.start, range.end); + else + flush_tlb_range(vma, old_end - len, old_end); + mmu_notifier_invalidate_range_end(&range); + i_mmap_unlock_write(mapping); + hugetlb_vma_unlock_write(vma); + + return len + old_addr - old_end; +} + +static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct page *ref_page, zap_flags_t zap_flags) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *ptep; + pte_t pte; + spinlock_t *ptl; + struct page *page; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + struct mmu_notifier_range range; + unsigned long last_addr_mask; + bool force_flush = false; + + WARN_ON(!is_vm_hugetlb_page(vma)); + BUG_ON(start & ~huge_page_mask(h)); + BUG_ON(end & ~huge_page_mask(h)); + + /* + * This is a hugetlb vma, all the pte entries should point + * to huge page. + */ + tlb_change_page_size(tlb, sz); + tlb_start_vma(tlb, vma); + + /* + * If sharing possible, alert mmu notifiers of worst case. 
+ */ + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, + end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + mmu_notifier_invalidate_range_start(&range); + last_addr_mask = hugetlb_mask_last_page(h); + address = start; + for (; address < end; address += sz) { + ptep = huge_pte_offset(mm, address, sz); + if (!ptep) { + address |= last_addr_mask; + continue; + } + + ptl = huge_pte_lock(h, mm, ptep); + if (huge_pmd_unshare(mm, vma, address, ptep)) { + spin_unlock(ptl); + tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); + force_flush = true; + address |= last_addr_mask; + continue; + } + + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte)) { + spin_unlock(ptl); + continue; + } + + /* + * Migrating hugepage or HWPoisoned hugepage is already + * unmapped and its refcount is dropped, so just clear pte here. + */ + if (unlikely(!pte_present(pte))) { +#ifdef CONFIG_PTE_MARKER_UFFD_WP + /* + * If the pte was wr-protected by uffd-wp in any of the + * swap forms, meanwhile the caller does not want to + * drop the uffd-wp bit in this zap, then replace the + * pte with a marker. + */ + if (pte_swp_uffd_wp_any(pte) && + !(zap_flags & ZAP_FLAG_DROP_MARKER)) + set_huge_pte_at(mm, address, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP)); + else +#endif + huge_pte_clear(mm, address, ptep, sz); + spin_unlock(ptl); + continue; + } + + page = pte_page(pte); + /* + * If a reference page is supplied, it is because a specific + * page is being unmapped, not a range. Ensure the page we + * are about to unmap is the actual page of interest. 
+ */ + if (ref_page) { + if (page != ref_page) { + spin_unlock(ptl); + continue; + } + /* + * Mark the VMA as having unmapped its page so that + * future faults in this VMA will fail rather than + * looking like data was lost + */ + set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); + } + + pte = huge_ptep_get_and_clear(mm, address, ptep); + tlb_remove_huge_tlb_entry(h, tlb, ptep, address); + if (huge_pte_dirty(pte)) + set_page_dirty(page); +#ifdef CONFIG_PTE_MARKER_UFFD_WP + /* Leave a uffd-wp pte marker if needed */ + if (huge_pte_uffd_wp(pte) && + !(zap_flags & ZAP_FLAG_DROP_MARKER)) + set_huge_pte_at(mm, address, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP)); +#endif + hugetlb_count_sub(pages_per_huge_page(h), mm); + page_remove_rmap(page, vma, true); + + spin_unlock(ptl); + tlb_remove_page_size(tlb, page, huge_page_size(h)); + /* + * Bail out after unmapping reference page if supplied + */ + if (ref_page) + break; + } + mmu_notifier_invalidate_range_end(&range); + tlb_end_vma(tlb, vma); + + /* + * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We + * could defer the flush until now, since by holding i_mmap_rwsem we + * guarantee that the last reference would not be dropped. But we must + * do the flushing before we return, as otherwise i_mmap_rwsem will be + * dropped and the last reference to the shared PMDs page might be + * dropped as well. + * + * In theory we could defer the freeing of the PMD pages as well, but + * huge_pmd_unshare() relies on the exact page_count for the PMD page to + * detect sharing, so we cannot defer the release of the page either. + * Instead, do flush now. 
+ */ + if (force_flush) + tlb_flush_mmu_tlbonly(tlb); +} + +void __unmap_hugepage_range_final(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct page *ref_page, + zap_flags_t zap_flags) +{ + hugetlb_vma_lock_write(vma); + i_mmap_lock_write(vma->vm_file->f_mapping); + + __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags); + + if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ + /* + * Unlock and free the vma lock before releasing i_mmap_rwsem. + * When the vma_lock is freed, this makes the vma ineligible + * for pmd sharing. And, i_mmap_rwsem is required to set up + * pmd sharing. This is important as page tables for this + * unmapped range will be asynchronously deleted. If the page + * tables are shared, there will be issues when accessed by + * someone else. + */ + __hugetlb_vma_unlock_write_free(vma); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } else { + i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); + } +} + +void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct page *ref_page, + zap_flags_t zap_flags) +{ + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, vma->vm_mm); + __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); + tlb_finish_mmu(&tlb); +} + +/* + * This is called when the original mapper is failing to COW a MAP_PRIVATE + * mapping it owns the reserve page for. The intention is to unmap the page + * from other VMAs and let the children be SIGKILLed if they are faulting the + * same region. + */ +static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, unsigned long address) +{ + struct hstate *h = hstate_vma(vma); + struct vm_area_struct *iter_vma; + struct address_space *mapping; + pgoff_t pgoff; + + /* + * vm_pgoff is in PAGE_SIZE units, hence the different calculation + * from page cache lookup which is in HPAGE_SIZE units. 
+ */ + address = address & huge_page_mask(h); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; + mapping = vma->vm_file->f_mapping; + + /* + * Take the mapping lock for the duration of the table walk. As + * this mapping should be shared between all the VMAs, + * __unmap_hugepage_range() is called as the lock is already held + */ + i_mmap_lock_write(mapping); + vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { + /* Do not unmap the current VMA */ + if (iter_vma == vma) + continue; + + /* + * Shared VMAs have their own reserves and do not affect + * MAP_PRIVATE accounting but it is possible that a shared + * VMA is using the same page so check and skip such VMAs. + */ + if (iter_vma->vm_flags & VM_MAYSHARE) + continue; + + /* + * Unmap the page from other VMAs without their own reserves. + * They get marked to be SIGKILLed if they fault in these + * areas. This is because a future no-page fault on this VMA + * could insert a zeroed page instead of the data existing + * from the time of fork. This would look like data corruption + */ + if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) + unmap_hugepage_range(iter_vma, address, + address + huge_page_size(h), page, 0); + } + i_mmap_unlock_write(mapping); +} + +/* + * hugetlb_wp() should be called with page lock of the original hugepage held. + * Called with hugetlb_fault_mutex_table held and pte_page locked so we + * cannot race with other handlers or page migration. + * Keep the pte_same checks anyway to make transition from the mutex easier. 
+ */ +static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int flags, + struct page *pagecache_page, spinlock_t *ptl) +{ + const bool unshare = flags & FAULT_FLAG_UNSHARE; + pte_t pte = huge_ptep_get(ptep); + struct hstate *h = hstate_vma(vma); + struct page *old_page, *new_page; + int outside_reserve = 0; + vm_fault_t ret = 0; + unsigned long haddr = address & huge_page_mask(h); + struct mmu_notifier_range range; + + VM_BUG_ON(unshare && (flags & FOLL_WRITE)); + VM_BUG_ON(!unshare && !(flags & FOLL_WRITE)); + + /* + * Never handle CoW for uffd-wp protected pages. It should be only + * handled when the uffd-wp protection is removed. + * + * Note that only the CoW optimization path (in hugetlb_no_page()) + * can trigger this, because hugetlb_fault() will always resolve + * uffd-wp bit first. + */ + if (!unshare && huge_pte_uffd_wp(pte)) + return 0; + + /* + * hugetlb does not support FOLL_FORCE-style write faults that keep the + * PTE mapped R/O such as maybe_mkwrite() would do. + */ + if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE))) + return VM_FAULT_SIGSEGV; + + /* Let's take out MAP_SHARED mappings first. */ + if (vma->vm_flags & VM_MAYSHARE) { + if (unlikely(unshare)) + return 0; + set_huge_ptep_writable(vma, haddr, ptep); + return 0; + } + + old_page = pte_page(pte); + + delayacct_wpcopy_start(); + +retry_avoidcopy: + /* + * If no-one else is actually using this page, we're the exclusive + * owner and can reuse this page. 
+ */ + if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { + if (!PageAnonExclusive(old_page)) + page_move_anon_rmap(old_page, vma); + if (likely(!unshare)) + set_huge_ptep_writable(vma, haddr, ptep); + + delayacct_wpcopy_end(); + return 0; + } + VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page), + old_page); + + /* + * If the process that created a MAP_PRIVATE mapping is about to + * perform a COW due to a shared page count, attempt to satisfy + * the allocation without using the existing reserves. The pagecache + * page is used to determine if the reserve at this address was + * consumed or not. If reserves were used, a partial faulted mapping + * at the time of fork() could consume its reserves on COW instead + * of the full address range. + */ + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && + old_page != pagecache_page) + outside_reserve = 1; + + get_page(old_page); + + /* + * Drop page table lock as buddy allocator may be called. It will + * be acquired again before returning to the caller, as expected. + */ + spin_unlock(ptl); + new_page = alloc_huge_page(vma, haddr, outside_reserve); + + if (IS_ERR(new_page)) { + /* + * If a process owning a MAP_PRIVATE mapping fails to COW, + * it is due to references held by a child and an insufficient + * huge page pool. To guarantee the original mappers + * reliability, unmap the page from child processes. The child + * may get SIGKILLed if it later faults. + */ + if (outside_reserve) { + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx; + u32 hash; + + put_page(old_page); + /* + * Drop hugetlb_fault_mutex and vma_lock before + * unmapping. unmapping needs to hold vma_lock + * in write mode. Dropping vma_lock in read mode + * here is OK as COW mappings do not interact with + * PMD sharing. + * + * Reacquire both after unmap operation. 
+ */ + idx = vma_hugecache_offset(h, vma, haddr); + hash = hugetlb_fault_mutex_hash(mapping, idx); + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + + unmap_ref_private(mm, vma, old_page, haddr); + + mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(vma); + spin_lock(ptl); + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); + if (likely(ptep && + pte_same(huge_ptep_get(ptep), pte))) + goto retry_avoidcopy; + /* + * race occurs while re-acquiring page table + * lock, and our job is done. + */ + delayacct_wpcopy_end(); + return 0; + } + + ret = vmf_error(PTR_ERR(new_page)); + goto out_release_old; + } + + /* + * When the original hugepage is shared one, it does not have + * anon_vma prepared. + */ + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto out_release_all; + } + + copy_user_huge_page(new_page, old_page, address, vma, + pages_per_huge_page(h)); + __SetPageUptodate(new_page); + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, + haddr + huge_page_size(h)); + mmu_notifier_invalidate_range_start(&range); + + /* + * Retake the page table lock to check for racing updates + * before the page tables are altered + */ + spin_lock(ptl); + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); + if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { + ClearHPageRestoreReserve(new_page); + + /* Break COW or unshare */ + huge_ptep_clear_flush(vma, haddr, ptep); + mmu_notifier_invalidate_range(mm, range.start, range.end); + page_remove_rmap(old_page, vma, true); + hugepage_add_new_anon_rmap(new_page, vma, haddr); + set_huge_pte_at(mm, haddr, ptep, + make_huge_pte(vma, new_page, !unshare)); + SetHPageMigratable(new_page); + /* Make the old page be freed below */ + new_page = old_page; + } + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(&range); +out_release_all: + /* + * No restore in case of successful pagetable update (Break COW or + * unshare) + */ + if (new_page != 
old_page) + restore_reserve_on_error(h, vma, haddr, new_page); + put_page(new_page); +out_release_old: + put_page(old_page); + + spin_lock(ptl); /* Caller expects lock to be held */ + + delayacct_wpcopy_end(); + return ret; +} + +/* + * Return whether there is a pagecache page to back given address within VMA. + * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. + */ +static bool hugetlbfs_pagecache_present(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct address_space *mapping; + pgoff_t idx; + struct page *page; + + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + + page = find_get_page(mapping, idx); + if (page) + put_page(page); + return page != NULL; +} + +int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, + pgoff_t idx) +{ + struct folio *folio = page_folio(page); + struct inode *inode = mapping->host; + struct hstate *h = hstate_inode(inode); + int err; + + __folio_set_locked(folio); + err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); + + if (unlikely(err)) { + __folio_clear_locked(folio); + return err; + } + ClearHPageRestoreReserve(page); + + /* + * mark folio dirty so that it will not be removed from cache/file + * by non-hugetlbfs specific code paths. + */ + folio_mark_dirty(folio); + + spin_lock(&inode->i_lock); + inode->i_blocks += blocks_per_huge_page(h); + spin_unlock(&inode->i_lock); + return 0; +} + +static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, + unsigned int flags, + unsigned long haddr, + unsigned long addr, + unsigned long reason) +{ + u32 hash; + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .real_address = addr, + .flags = flags, + + /* + * Hard to debug if it ends up being + * used by a callee that assumes + * something about the other + * uninitialized fields... 
same as in + * memory.c + */ + }; + + /* + * vma_lock and hugetlb_fault_mutex must be dropped before handling + * userfault. Also mmap_lock could be dropped due to handling + * userfault, any vma operation should be careful from here. + */ + hugetlb_vma_unlock_read(vma); + hash = hugetlb_fault_mutex_hash(mapping, idx); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + return handle_userfault(&vmf, reason); +} + +/* + * Recheck pte with pgtable lock. Returns true if pte didn't change, or + * false if pte changed or is changing. + */ +static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, + pte_t *ptep, pte_t old_pte) +{ + spinlock_t *ptl; + bool same; + + ptl = huge_pte_lock(h, mm, ptep); + same = pte_same(huge_ptep_get(ptep), old_pte); + spin_unlock(ptl); + + return same; +} + +static vm_fault_t hugetlb_no_page(struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, + pte_t old_pte, unsigned int flags) +{ + struct hstate *h = hstate_vma(vma); + vm_fault_t ret = VM_FAULT_SIGBUS; + int anon_rmap = 0; + unsigned long size; + struct page *page; + pte_t new_pte; + spinlock_t *ptl; + unsigned long haddr = address & huge_page_mask(h); + bool new_page, new_pagecache_page = false; + u32 hash = hugetlb_fault_mutex_hash(mapping, idx); + + /* + * Currently, we are forced to kill the process in the event the + * original mapper has unmapped pages from the child due to a failed + * COW/unsharing. Warn that such a situation has occurred as it may not + * be obvious. + */ + if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { + pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", + current->pid); + goto out; + } + + /* + * Use page lock to guard against racing truncation + * before we get page_table_lock. 
+ */ + new_page = false; + page = find_lock_page(mapping, idx); + if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; + /* Check for page in userfault range */ + if (userfaultfd_missing(vma)) { + /* + * Since hugetlb_no_page() was examining pte + * without pgtable lock, we need to re-test under + * lock because the pte may not be stable and could + * have changed from under us. Try to detect + * either changed or during-changing ptes and retry + * properly when needed. + * + * Note that userfaultfd is actually fine with + * false positives (e.g. caused by pte changed), + * but not wrong logical events (e.g. caused by + * reading a pte during changing). The latter can + * confuse the userspace, so the strictness is very + * much preferred. E.g., MISSING event should + * never happen on the page after UFFDIO_COPY has + * correctly installed the page and returned. + */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MISSING); + } + + page = alloc_huge_page(vma, haddr, 0); + if (IS_ERR(page)) { + /* + * Returning error will result in faulting task being + * sent SIGBUS. The hugetlb fault mutex prevents two + * tasks from racing to fault in the same page which + * could result in false unable to allocate errors. + * Page migration does not take the fault mutex, but + * does a clear then write of pte's under page table + * lock. Page fault code could race with migration, + * notice the clear pte and try to allocate a page + * here. Before returning error, get ptl and make + * sure there really is no pte entry. 
+ */ + if (hugetlb_pte_stable(h, mm, ptep, old_pte)) + ret = vmf_error(PTR_ERR(page)); + else + ret = 0; + goto out; + } + clear_huge_page(page, address, pages_per_huge_page(h)); + __SetPageUptodate(page); + new_page = true; + + if (vma->vm_flags & VM_MAYSHARE) { + int err = hugetlb_add_to_page_cache(page, mapping, idx); + if (err) { + /* + * err can't be -EEXIST which implies someone + * else consumed the reservation since hugetlb + * fault mutex is held when add a hugetlb page + * to the page cache. So it's safe to call + * restore_reserve_on_error() here. + */ + restore_reserve_on_error(h, vma, haddr, page); + put_page(page); + goto out; + } + new_pagecache_page = true; + } else { + lock_page(page); + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + anon_rmap = 1; + } + } else { + /* + * If memory error occurs between mmap() and fault, some process + * don't have hwpoisoned swap entry for errored virtual address. + * So we need to block hugepage fault by PG_hwpoison bit check. + */ + if (unlikely(PageHWPoison(page))) { + ret = VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(hstate_index(h)); + goto backout_unlocked; + } + + /* Check for page in userfault range. */ + if (userfaultfd_minor(vma)) { + unlock_page(page); + put_page(page); + /* See comment in userfaultfd_missing() block above */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MINOR); + } + } + + /* + * If we are going to COW a private mapping later, we examine the + * pending reservations for this page now. This will ensure that + * any allocations necessary to record that reservation occur outside + * the spinlock. 
+ */ + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + if (vma_needs_reservation(h, vma, haddr) < 0) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + /* Just decrements count, does not deallocate */ + vma_end_reservation(h, vma, haddr); + } + + ptl = huge_pte_lock(h, mm, ptep); + ret = 0; + /* If pte changed from under us, retry */ + if (!pte_same(huge_ptep_get(ptep), old_pte)) + goto backout; + + if (anon_rmap) { + ClearHPageRestoreReserve(page); + hugepage_add_new_anon_rmap(page, vma, haddr); + } else + page_dup_file_rmap(page, true); + new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + && (vma->vm_flags & VM_SHARED))); + /* + * If this pte was previously wr-protected, keep it wr-protected even + * if populated. + */ + if (unlikely(pte_marker_uffd_wp(old_pte))) + new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte)); + set_huge_pte_at(mm, haddr, ptep, new_pte); + + hugetlb_count_add(pages_per_huge_page(h), mm); + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + /* Optimization, do the COW without a second fault */ + ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl); + } + + spin_unlock(ptl); + + /* + * Only set HPageMigratable in newly allocated pages. Existing pages + * found in the pagecache may not have HPageMigratable set if they have + * been isolated for migration. 
+ */ + if (new_page) + SetHPageMigratable(page); + + unlock_page(page); +out: + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + return ret; + +backout: + spin_unlock(ptl); +backout_unlocked: + if (new_page && !new_pagecache_page) + restore_reserve_on_error(h, vma, haddr, page); + + unlock_page(page); + put_page(page); + goto out; +} + +#ifdef CONFIG_SMP +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) +{ + unsigned long key[2]; + u32 hash; + + key[0] = (unsigned long) mapping; + key[1] = idx; + + hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); + + return hash & (num_fault_mutexes - 1); +} +#else +/* + * For uniprocessor systems we always use a single mutex, so just + * return 0 and avoid the hashing overhead. + */ +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) +{ + return 0; +} +#endif + +vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + pte_t *ptep, entry; + spinlock_t *ptl; + vm_fault_t ret; + u32 hash; + pgoff_t idx; + struct page *page = NULL; + struct page *pagecache_page = NULL; + struct hstate *h = hstate_vma(vma); + struct address_space *mapping; + int need_wait_lock = 0; + unsigned long haddr = address & huge_page_mask(h); + + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); + if (ptep) { + /* + * Since we hold no locks, ptep could be stale. That is + * OK as we are only making decisions based on content and + * not actually modifying content here. 
+ */ + entry = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_migration(entry))) { + migration_entry_wait_huge(vma, ptep); + return 0; + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + return VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(hstate_index(h)); + } + + /* + * Serialize hugepage allocation and instantiation, so that we don't + * get spurious allocation failures if two CPUs race to instantiate + * the same page in the page cache. + */ + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, haddr); + hash = hugetlb_fault_mutex_hash(mapping, idx); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + /* + * Acquire vma lock before calling huge_pte_alloc and hold + * until finished with ptep. This prevents huge_pmd_unshare from + * being called elsewhere and making the ptep no longer valid. + * + * ptep could have already been assigned via huge_pte_offset. That + * is OK, as huge_pte_alloc will return the same value unless + * something has changed. + */ + hugetlb_vma_lock_read(vma); + ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); + if (!ptep) { + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + return VM_FAULT_OOM; + } + + entry = huge_ptep_get(ptep); + /* PTE markers should be handled the same way as none pte */ + if (huge_pte_none_mostly(entry)) + /* + * hugetlb_no_page will drop vma lock and hugetlb fault + * mutex internally, which make us return immediately. + */ + return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, + entry, flags); + + ret = 0; + + /* + * entry could be a migration/hwpoison entry at this point, so this + * check prevents the kernel from going below assuming that we have + * an active hugepage in pagecache. This goto expects the 2nd page + * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will + * properly handle it. 
+ */ + if (!pte_present(entry)) + goto out_mutex; + + /* + * If we are going to COW/unshare the mapping later, we examine the + * pending reservations for this page now. This will ensure that any + * allocations necessary to record that reservation occur outside the + * spinlock. Also lookup the pagecache page now as it is used to + * determine if a reservation has been consumed. + */ + if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && + !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { + if (vma_needs_reservation(h, vma, haddr) < 0) { + ret = VM_FAULT_OOM; + goto out_mutex; + } + /* Just decrements count, does not deallocate */ + vma_end_reservation(h, vma, haddr); + + pagecache_page = find_lock_page(mapping, idx); + } + + ptl = huge_pte_lock(h, mm, ptep); + + /* Check for a racing update before calling hugetlb_wp() */ + if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + goto out_ptl; + + /* Handle userfault-wp first, before trying to lock more pages */ + if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && + (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .real_address = address, + .flags = flags, + }; + + spin_unlock(ptl); + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + return handle_userfault(&vmf, VM_UFFD_WP); + } + + /* + * hugetlb_wp() requires page locks of pte_page(entry) and + * pagecache_page, so here we need take the former one + * when page != pagecache_page or !pagecache_page. 
+ */ + page = pte_page(entry); + if (page != pagecache_page) + if (!trylock_page(page)) { + need_wait_lock = 1; + goto out_ptl; + } + + get_page(page); + + if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { + if (!huge_pte_write(entry)) { + ret = hugetlb_wp(mm, vma, address, ptep, flags, + pagecache_page, ptl); + goto out_put_page; + } else if (likely(flags & FAULT_FLAG_WRITE)) { + entry = huge_pte_mkdirty(entry); + } + } + entry = pte_mkyoung(entry); + if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, + flags & FAULT_FLAG_WRITE)) + update_mmu_cache(vma, haddr, ptep); +out_put_page: + if (page != pagecache_page) + unlock_page(page); + put_page(page); +out_ptl: + spin_unlock(ptl); + + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } +out_mutex: + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + /* + * Generally it's safe to hold refcount during waiting page lock. But + * here we just wait to defer the next page fault to avoid busy loop and + * the page is not used after unlocked before returning from the current + * page fault. So we are safe from accessing freed page, even if we wait + * here without taking refcount. + */ + if (need_wait_lock) + wait_on_page_locked(page); + return ret; +} + +#ifdef CONFIG_USERFAULTFD +/* + * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with + * modifications for huge pages. 
+ */ +int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, + pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + enum mcopy_atomic_mode mode, + struct page **pagep, + bool wp_copy) +{ + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); + struct hstate *h = hstate_vma(dst_vma); + struct address_space *mapping = dst_vma->vm_file->f_mapping; + pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); + unsigned long size; + int vm_shared = dst_vma->vm_flags & VM_SHARED; + pte_t _dst_pte; + spinlock_t *ptl; + int ret = -ENOMEM; + struct page *page; + int writable; + bool page_in_pagecache = false; + + if (is_continue) { + ret = -EFAULT; + page = find_lock_page(mapping, idx); + if (!page) + goto out; + page_in_pagecache = true; + } else if (!*pagep) { + /* If a page already exists, then it's UFFDIO_COPY for + * a non-missing case. Return -EEXIST. + */ + if (vm_shared && + hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { + ret = -EEXIST; + goto out; + } + + page = alloc_huge_page(dst_vma, dst_addr, 0); + if (IS_ERR(page)) { + ret = -ENOMEM; + goto out; + } + + ret = copy_huge_page_from_user(page, + (const void __user *) src_addr, + pages_per_huge_page(h), false); + + /* fallback to copy_from_user outside mmap_lock */ + if (unlikely(ret)) { + ret = -ENOENT; + /* Free the allocated page which may have + * consumed a reservation. + */ + restore_reserve_on_error(h, dst_vma, dst_addr, page); + put_page(page); + + /* Allocate a temporary page to hold the copied + * contents. + */ + page = alloc_huge_page_vma(h, dst_vma, dst_addr); + if (!page) { + ret = -ENOMEM; + goto out; + } + *pagep = page; + /* Set the outparam pagep and return to the caller to + * copy the contents outside the lock. Don't free the + * page. 
+ */ + goto out; + } + } else { + if (vm_shared && + hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { + put_page(*pagep); + ret = -EEXIST; + *pagep = NULL; + goto out; + } + + page = alloc_huge_page(dst_vma, dst_addr, 0); + if (IS_ERR(page)) { + put_page(*pagep); + ret = -ENOMEM; + *pagep = NULL; + goto out; + } + copy_user_huge_page(page, *pagep, dst_addr, dst_vma, + pages_per_huge_page(h)); + put_page(*pagep); + *pagep = NULL; + } + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + /* Add shared, newly allocated pages to the page cache. */ + if (vm_shared && !is_continue) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + ret = -EFAULT; + if (idx >= size) + goto out_release_nounlock; + + /* + * Serialization between remove_inode_hugepages() and + * hugetlb_add_to_page_cache() below happens through the + * hugetlb_fault_mutex_table that here must be held by + * the caller. + */ + ret = hugetlb_add_to_page_cache(page, mapping, idx); + if (ret) + goto out_release_nounlock; + page_in_pagecache = true; + } + + ptl = huge_pte_lock(h, dst_mm, dst_pte); + + ret = -EIO; + if (PageHWPoison(page)) + goto out_release_unlock; + + /* + * We allow to overwrite a pte marker: consider when both MISSING|WP + * registered, we firstly wr-protect a none pte which has no page cache + * page backing it, then access the page. + */ + ret = -EEXIST; + if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) + goto out_release_unlock; + + if (page_in_pagecache) { + page_dup_file_rmap(page, true); + } else { + ClearHPageRestoreReserve(page); + hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); + } + + /* + * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY + * with wp flag set, don't set pte write bit. 
+ */ + if (wp_copy || (is_continue && !vm_shared)) + writable = 0; + else + writable = dst_vma->vm_flags & VM_WRITE; + + _dst_pte = make_huge_pte(dst_vma, page, writable); + /* + * Always mark UFFDIO_COPY page dirty; note that this may not be + * extremely important for hugetlbfs for now since swapping is not + * supported, but we should still be clear in that this page cannot be + * thrown away at will, even if write bit not set. + */ + _dst_pte = huge_pte_mkdirty(_dst_pte); + _dst_pte = pte_mkyoung(_dst_pte); + + if (wp_copy) + _dst_pte = huge_pte_mkuffd_wp(_dst_pte); + + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + hugetlb_count_add(pages_per_huge_page(h), dst_mm); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + + spin_unlock(ptl); + if (!is_continue) + SetHPageMigratable(page); + if (vm_shared || is_continue) + unlock_page(page); + ret = 0; +out: + return ret; +out_release_unlock: + spin_unlock(ptl); + if (vm_shared || is_continue) + unlock_page(page); +out_release_nounlock: + if (!page_in_pagecache) + restore_reserve_on_error(h, dst_vma, dst_addr, page); + put_page(page); + goto out; +} +#endif /* CONFIG_USERFAULTFD */ + +static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, + int refs, struct page **pages, + struct vm_area_struct **vmas) +{ + int nr; + + for (nr = 0; nr < refs; nr++) { + if (likely(pages)) + pages[nr] = nth_page(page, nr); + if (vmas) + vmas[nr] = vma; + } +} + +static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, + bool *unshare) +{ + pte_t pteval = huge_ptep_get(pte); + + *unshare = false; + if (is_swap_pte(pteval)) + return true; + if (huge_pte_write(pteval)) + return false; + if (flags & FOLL_WRITE) + return true; + if (gup_must_unshare(flags, pte_page(pteval))) { + *unshare = true; + return true; + } + return false; +} + +long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, + struct page 
**pages, struct vm_area_struct **vmas, + unsigned long *position, unsigned long *nr_pages, + long i, unsigned int flags, int *locked) +{ + unsigned long pfn_offset; + unsigned long vaddr = *position; + unsigned long remainder = *nr_pages; + struct hstate *h = hstate_vma(vma); + int err = -EFAULT, refs; + + while (vaddr < vma->vm_end && remainder) { + pte_t *pte; + spinlock_t *ptl = NULL; + bool unshare = false; + int absent; + struct page *page; + + /* + * If we have a pending SIGKILL, don't keep faulting pages and + * potentially allocating memory. + */ + if (fatal_signal_pending(current)) { + remainder = 0; + break; + } + + /* + * Some archs (sparc64, sh*) have multiple pte_ts to + * each hugepage. We have to make sure we get the + * first, for the page indexing below to work. + * + * Note that page table lock is not held when pte is null. + */ + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), + huge_page_size(h)); + if (pte) + ptl = huge_pte_lock(h, mm, pte); + absent = !pte || huge_pte_none(huge_ptep_get(pte)); + + /* + * When coredumping, it suits get_dump_page if we just return + * an error where there's an empty slot with no huge pagecache + * to back it. This way, we avoid allocating a hugepage, and + * the sparse dumpfile avoids allocating disk blocks, but its + * huge holes still show up with zeroes where they need to be. + */ + if (absent && (flags & FOLL_DUMP) && + !hugetlbfs_pagecache_present(h, vma, vaddr)) { + if (pte) + spin_unlock(ptl); + remainder = 0; + break; + } + + /* + * We need call hugetlb_fault for both hugepages under migration + * (in which case hugetlb_fault waits for the migration,) and + * hwpoisoned hugepages (in which case we need to prevent the + * caller from accessing to them.) In order to do this, we use + * here is_swap_pte instead of is_hugetlb_entry_migration and + * is_hugetlb_entry_hwpoisoned. 
This is because it simply covers + * both cases, and because we can't follow correct pages + * directly from any kind of swap entries. + */ + if (absent || + __follow_hugetlb_must_fault(flags, pte, &unshare)) { + vm_fault_t ret; + unsigned int fault_flags = 0; + + if (pte) + spin_unlock(ptl); + if (flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + else if (unshare) + fault_flags |= FAULT_FLAG_UNSHARE; + if (locked) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | + FAULT_FLAG_KILLABLE; + if (flags & FOLL_NOWAIT) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | + FAULT_FLAG_RETRY_NOWAIT; + if (flags & FOLL_TRIED) { + /* + * Note: FAULT_FLAG_ALLOW_RETRY and + * FAULT_FLAG_TRIED can co-exist + */ + fault_flags |= FAULT_FLAG_TRIED; + } + ret = hugetlb_fault(mm, vma, vaddr, fault_flags); + if (ret & VM_FAULT_ERROR) { + err = vm_fault_to_errno(ret, flags); + remainder = 0; + break; + } + if (ret & VM_FAULT_RETRY) { + if (locked && + !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) + *locked = 0; + *nr_pages = 0; + /* + * VM_FAULT_RETRY must not return an + * error, it will return zero + * instead. + * + * No need to update "position" as the + * caller will not check it after + * *nr_pages is set to 0. + */ + return i; + } + continue; + } + + pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; + page = pte_page(huge_ptep_get(pte)); + + VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && + !PageAnonExclusive(page), page); + + /* + * If subpage information not requested, update counters + * and skip the same_page loop below. 
+ */ + if (!pages && !vmas && !pfn_offset && + (vaddr + huge_page_size(h) < vma->vm_end) && + (remainder >= pages_per_huge_page(h))) { + vaddr += huge_page_size(h); + remainder -= pages_per_huge_page(h); + i += pages_per_huge_page(h); + spin_unlock(ptl); + continue; + } + + /* vaddr may not be aligned to PAGE_SIZE */ + refs = min3(pages_per_huge_page(h) - pfn_offset, remainder, + (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); + + if (pages || vmas) + record_subpages_vmas(nth_page(page, pfn_offset), + vma, refs, + likely(pages) ? pages + i : NULL, + vmas ? vmas + i : NULL); + + if (pages) { + /* + * try_grab_folio() should always succeed here, + * because: a) we hold the ptl lock, and b) we've just + * checked that the huge page is present in the page + * tables. If the huge page is present, then the tail + * pages must also be present. The ptl prevents the + * head page and tail pages from being rearranged in + * any way. So this page must be available at this + * point, unless the page refcount overflowed: + */ + if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs, + flags))) { + spin_unlock(ptl); + remainder = 0; + err = -ENOMEM; + break; + } + } + + vaddr += (refs << PAGE_SHIFT); + remainder -= refs; + i += refs; + + spin_unlock(ptl); + } + *nr_pages = remainder; + /* + * setting position is actually required only if remainder is + * not zero but it's faster not to add a "if (remainder)" + * branch. + */ + *position = vaddr; + + return i ? 
i : err; +} + +unsigned long hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, + pgprot_t newprot, unsigned long cp_flags) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long start = address; + pte_t *ptep; + pte_t pte; + struct hstate *h = hstate_vma(vma); + unsigned long pages = 0, psize = huge_page_size(h); + bool shared_pmd = false; + struct mmu_notifier_range range; + unsigned long last_addr_mask; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + + /* + * In the case of shared PMDs, the area to flush could be beyond + * start/end. Set range.start/range.end to cover the maximum possible + * range if PMD sharing is possible. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, + 0, vma, mm, start, end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + + BUG_ON(address >= end); + flush_cache_range(vma, range.start, range.end); + + mmu_notifier_invalidate_range_start(&range); + hugetlb_vma_lock_write(vma); + i_mmap_lock_write(vma->vm_file->f_mapping); + last_addr_mask = hugetlb_mask_last_page(h); + for (; address < end; address += psize) { + spinlock_t *ptl; + ptep = huge_pte_offset(mm, address, psize); + if (!ptep) { + if (!uffd_wp) { + address |= last_addr_mask; + continue; + } + /* + * Userfaultfd wr-protect requires pgtable + * pre-allocations to install pte markers. + */ + ptep = huge_pte_alloc(mm, vma, address, psize); + if (!ptep) + break; + } + ptl = huge_pte_lock(h, mm, ptep); + if (huge_pmd_unshare(mm, vma, address, ptep)) { + /* + * When uffd-wp is enabled on the vma, unshare + * shouldn't happen at all. Warn about it if it + * happened due to some reason. + */ + WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); + pages++; + spin_unlock(ptl); + shared_pmd = true; + address |= last_addr_mask; + continue; + } + pte = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { + /* Nothing to do. 
*/ + } else if (unlikely(is_hugetlb_entry_migration(pte))) { + swp_entry_t entry = pte_to_swp_entry(pte); + struct page *page = pfn_swap_entry_to_page(entry); + pte_t newpte = pte; + + if (is_writable_migration_entry(entry)) { + if (PageAnon(page)) + entry = make_readable_exclusive_migration_entry( + swp_offset(entry)); + else + entry = make_readable_migration_entry( + swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + pages++; + } + + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); + if (!pte_same(pte, newpte)) + set_huge_pte_at(mm, address, ptep, newpte); + } else if (unlikely(is_pte_marker(pte))) { + /* No other markers apply for now. */ + WARN_ON_ONCE(!pte_marker_uffd_wp(pte)); + if (uffd_wp_resolve) + /* Safe to modify directly (non-present->none). */ + huge_pte_clear(mm, address, ptep, psize); + } else if (!huge_pte_none(pte)) { + pte_t old_pte; + unsigned int shift = huge_page_shift(hstate_vma(vma)); + + old_pte = huge_ptep_modify_prot_start(vma, address, ptep); + pte = huge_pte_modify(old_pte, newprot); + pte = arch_make_huge_pte(pte, shift, vma->vm_flags); + if (uffd_wp) + pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte)); + else if (uffd_wp_resolve) + pte = huge_pte_clear_uffd_wp(pte); + huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); + pages++; + } else { + /* None pte */ + if (unlikely(uffd_wp)) + /* Safe to modify directly (none->non-present). */ + set_huge_pte_at(mm, address, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP)); + } + spin_unlock(ptl); + } + /* + * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare + * may have cleared our pud entry and done put_page on the page table: + * once we release i_mmap_rwsem, another task can do the final put_page + * and that page table be reused and filled with junk. If we actually + * did unshare a page of pmds, flush the range corresponding to the pud. 
+ */ + if (shared_pmd) + flush_hugetlb_tlb_range(vma, range.start, range.end); + else + flush_hugetlb_tlb_range(vma, start, end); + /* + * No need to call mmu_notifier_invalidate_range() we are downgrading + * page table protection not changing it to point to a new page. + * + * See Documentation/mm/mmu_notifier.rst + */ + i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); + mmu_notifier_invalidate_range_end(&range); + + return pages << h->order; +} + +/* Return true if reservation was successful, false otherwise. */ +bool hugetlb_reserve_pages(struct inode *inode, + long from, long to, + struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ + long chg, add = -1; + struct hstate *h = hstate_inode(inode); + struct hugepage_subpool *spool = subpool_inode(inode); + struct resv_map *resv_map; + struct hugetlb_cgroup *h_cg = NULL; + long gbl_reserve, regions_needed = 0; + + /* This should never happen */ + if (from > to) { + VM_WARN(1, "%s called with a negative range\n", __func__); + return false; + } + + /* + * vma specific semaphore used for pmd sharing and fault/truncation + * synchronization + */ + hugetlb_vma_lock_alloc(vma); + + /* + * Only apply hugepage reservation if asked. At fault time, an + * attempt will be made for VM_NORESERVE to allocate a page + * without using reserves + */ + if (vm_flags & VM_NORESERVE) + return true; + + /* + * Shared mappings base their reservation on the number of pages that + * are already allocated on behalf of the file. Private mappings need + * to reserve the full area even if read-only as mprotect() may be + * called to make the mapping read-write. Assume !vma is a shm mapping + */ + if (!vma || vma->vm_flags & VM_MAYSHARE) { + /* + * resv_map can not be NULL as hugetlb_reserve_pages is only + * called for inodes for which resv_maps were created (see + * hugetlbfs_get_inode). 
+ */ + resv_map = inode_resv_map(inode); + + chg = region_chg(resv_map, from, to, &regions_needed); + } else { + /* Private mapping. */ + resv_map = resv_map_alloc(); + if (!resv_map) + goto out_err; + + chg = to - from; + + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + + if (chg < 0) + goto out_err; + + if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h), + chg * pages_per_huge_page(h), &h_cg) < 0) + goto out_err; + + if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { + /* For private mappings, the hugetlb_cgroup uncharge info hangs + * of the resv_map. + */ + resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h); + } + + /* + * There must be enough pages in the subpool for the mapping. If + * the subpool has a minimum size, there may be some global + * reservations already in place (gbl_reserve). + */ + gbl_reserve = hugepage_subpool_get_pages(spool, chg); + if (gbl_reserve < 0) + goto out_uncharge_cgroup; + + /* + * Check enough hugepages are available for the reservation. + * Hand the pages back to the subpool if there are not + */ + if (hugetlb_acct_memory(h, gbl_reserve) < 0) + goto out_put_pages; + + /* + * Account for the reservations made. Shared mappings record regions + * that have reservations as they are shared by multiple VMAs. + * When the last VMA disappears, the region map says how much + * the reservation was and the page cache tells how much of + * the reservation was consumed. Private mappings are per-VMA and + * only the consumed reservations are tracked. When the VMA + * disappears, the original reservation is the VMA size and the + * consumed reservations are stored in the map. 
Hence, nothing + * else has to be done for private mappings here + */ + if (!vma || vma->vm_flags & VM_MAYSHARE) { + add = region_add(resv_map, from, to, regions_needed, h, h_cg); + + if (unlikely(add < 0)) { + hugetlb_acct_memory(h, -gbl_reserve); + goto out_put_pages; + } else if (unlikely(chg > add)) { + /* + * pages in this range were added to the reserve + * map between region_chg and region_add. This + * indicates a race with alloc_huge_page. Adjust + * the subpool and reserve counts modified above + * based on the difference. + */ + long rsv_adjust; + + /* + * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the + * reference to h_cg->css. See comment below for detail. + */ + hugetlb_cgroup_uncharge_cgroup_rsvd( + hstate_index(h), + (chg - add) * pages_per_huge_page(h), h_cg); + + rsv_adjust = hugepage_subpool_put_pages(spool, + chg - add); + hugetlb_acct_memory(h, -rsv_adjust); + } else if (h_cg) { + /* + * The file_regions will hold their own reference to + * h_cg->css. So we should release the reference held + * via hugetlb_cgroup_charge_cgroup_rsvd() when we are + * done. + */ + hugetlb_cgroup_put_rsvd_cgroup(h_cg); + } + } + return true; + +out_put_pages: + /* put back original number of pages, chg */ + (void)hugepage_subpool_put_pages(spool, chg); +out_uncharge_cgroup: + hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), + chg * pages_per_huge_page(h), h_cg); +out_err: + hugetlb_vma_lock_free(vma); + if (!vma || vma->vm_flags & VM_MAYSHARE) + /* Only call region_abort if the region_chg succeeded but the + * region_add failed or didn't run. 
+ */ + if (chg >= 0 && add < 0) + region_abort(resv_map, from, to, regions_needed); + if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + kref_put(&resv_map->refs, resv_map_release); + set_vma_resv_map(vma, NULL); + } + return false; +} + +long hugetlb_unreserve_pages(struct inode *inode, long start, long end, + long freed) +{ + struct hstate *h = hstate_inode(inode); + struct resv_map *resv_map = inode_resv_map(inode); + long chg = 0; + struct hugepage_subpool *spool = subpool_inode(inode); + long gbl_reserve; + + /* + * Since this routine can be called in the evict inode path for all + * hugetlbfs inodes, resv_map could be NULL. + */ + if (resv_map) { + chg = region_del(resv_map, start, end); + /* + * region_del() can fail in the rare case where a region + * must be split and another region descriptor can not be + * allocated. If end == LONG_MAX, it will not fail. + */ + if (chg < 0) + return chg; + } + + spin_lock(&inode->i_lock); + inode->i_blocks -= (blocks_per_huge_page(h) * freed); + spin_unlock(&inode->i_lock); + + /* + * If the subpool has a minimum size, the number of global + * reservations to be released may be adjusted. + * + * Note that !resv_map implies freed == 0. So (chg - freed) + * won't go negative. 
+ */ + gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); + hugetlb_acct_memory(h, -gbl_reserve); + + return 0; +} + +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +static unsigned long page_table_shareable(struct vm_area_struct *svma, + struct vm_area_struct *vma, + unsigned long addr, pgoff_t idx) +{ + unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + + svma->vm_start; + unsigned long sbase = saddr & PUD_MASK; + unsigned long s_end = sbase + PUD_SIZE; + + /* Allow segments to share if only one is marked locked */ + unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; + + /* + * match the virtual addresses, permission and the alignment of the + * page table page. + * + * Also, vma_lock (vm_private_data) is required for sharing. + */ + if (pmd_index(addr) != pmd_index(saddr) || + vm_flags != svm_flags || + !range_in_vma(svma, sbase, s_end) || + !svma->vm_private_data) + return 0; + + return saddr; +} + +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long start = addr & PUD_MASK; + unsigned long end = start + PUD_SIZE; + +#ifdef CONFIG_USERFAULTFD + if (uffd_disable_huge_pmd_share(vma)) + return false; +#endif + /* + * check on proper vm_flags and page table alignment + */ + if (!(vma->vm_flags & VM_MAYSHARE)) + return false; + if (!vma->vm_private_data) /* vma lock required for sharing */ + return false; + if (!range_in_vma(vma, start, end)) + return false; + return true; +} + +/* + * Determine if start,end range within vma could be mapped by shared pmd. + * If yes, adjust start and end to cover range associated with possible + * shared pmd mappings. 
+ */ +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ + unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE), + v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); + + /* + * vma needs to span at least one aligned PUD size, and the range + * must be at least partially within in. + */ + if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || + (*end <= v_start) || (*start >= v_end)) + return; + + /* Extend the range to be PUD aligned for a worst case scenario */ + if (*start > v_start) + *start = ALIGN_DOWN(*start, PUD_SIZE); + + if (*end < v_end) + *end = ALIGN(*end, PUD_SIZE); +} + +/* + * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() + * and returns the corresponding pte. While this is not necessary for the + * !shared pmd case because we can allocate the pmd later as well, it makes the + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_rwsem section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. 
/*
 * huge_pmd_share() - try to reuse another mapping's page table page for
 * @addr, then return the pmd entry for @addr.
 *
 * With the mapping's i_mmap lock held for read, every other vma that maps
 * the same file offset is examined; the first one accepted by
 * page_table_shareable() that already has an entry at the corresponding
 * address donates its page table page.  Whether or not a donor was found,
 * the function finishes with pmd_alloc() and returns its result.
 */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
		      unsigned long addr, pud_t *pud)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	/* File offset (in PAGE_SIZE units) that @addr maps within @vma. */
	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
			vma->vm_pgoff;
	struct vm_area_struct *svma;
	unsigned long saddr;
	pte_t *spte = NULL;
	pte_t *pte;
	spinlock_t *ptl;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
		/* A vma cannot share with itself. */
		if (svma == vma)
			continue;

		saddr = page_table_shareable(svma, vma, addr, idx);
		if (saddr) {
			spte = huge_pte_offset(svma->vm_mm, saddr,
					       vma_mmu_pagesize(svma));
			if (spte) {
				/*
				 * Take a reference on the page that holds
				 * the candidate entry before leaving the loop.
				 */
				get_page(virt_to_page(spte));
				break;
			}
		}
	}

	/* No donor found: just allocate a pmd below. */
	if (!spte)
		goto out;

	ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
	if (pud_none(*pud)) {
		/* Point our pud at the page-aligned base of the donor table. */
		pud_populate(mm, pud,
				(pmd_t *)((unsigned long)spte & PAGE_MASK));
		mm_inc_nr_pmds(mm);
	} else {
		/* The pud is already populated: drop the reference taken above. */
		put_page(virt_to_page(spte));
	}
	spin_unlock(ptl);
out:
	/* Done while still holding the i_mmap lock taken above. */
	pte = (pte_t *)pmd_alloc(mm, pud, addr);
	i_mmap_unlock_read(mapping);
	return pte;
}
+ * + * returns: 1 successfully unmapped a shared pte page + * 0 the underlying pte page is not shared, or it is the last user + */ +int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pgd_t *pgd = pgd_offset(mm, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); + + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + hugetlb_vma_assert_locked(vma); + BUG_ON(page_count(virt_to_page(ptep)) == 0); + if (page_count(virt_to_page(ptep)) == 1) + return 0; + + pud_clear(pud); + put_page(virt_to_page(ptep)); + mm_dec_nr_pmds(mm); + return 1; +} + +#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ + +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) +{ + return NULL; +} + +int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + return 0; +} + +void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, + unsigned long *start, unsigned long *end) +{ +} + +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) +{ + return false; +} +#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ + +#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, addr); + if (pud) { + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + } else { + BUG_ON(sz != PMD_SIZE); + if (want_pmd_share(vma, addr) && pud_none(*pud)) + pte = huge_pmd_share(mm, vma, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); + } + } + BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte)); + + return pte; +} + +/* + * huge_pte_offset() - Walk the page table to resolve the hugepage + * entry at address @addr + * + * Return: 
Pointer to page table entry (PUD or PMD) for + * address @addr, or NULL if a !p*d_present() entry is encountered and the + * size @sz doesn't match the hugepage size at this level of the page + * table. + */ +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return NULL; + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, addr); + if (sz == PUD_SIZE) + /* must be pud huge, non-present or none */ + return (pte_t *)pud; + if (!pud_present(*pud)) + return NULL; + /* must have a valid entry and size to go further */ + + pmd = pmd_offset(pud, addr); + /* must be pmd huge, non-present or none */ + return (pte_t *)pmd; +} + +/* + * Return a mask that can be used to update an address to the last huge + * page in a page table page mapping size. Used to skip non-present + * page table entries when linearly scanning address ranges. Architectures + * with unique huge page to page table relationships can define their own + * version of this routine. + */ +unsigned long hugetlb_mask_last_page(struct hstate *h) +{ + unsigned long hp_size = huge_page_size(h); + + if (hp_size == PUD_SIZE) + return P4D_SIZE - PUD_SIZE; + else if (hp_size == PMD_SIZE) + return PUD_SIZE - PMD_SIZE; + else + return 0UL; +} + +#else + +/* See description above. Architectures can provide their own version. */ +__weak unsigned long hugetlb_mask_last_page(struct hstate *h) +{ +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE + if (huge_page_size(h) == PMD_SIZE) + return PUD_SIZE - PMD_SIZE; +#endif + return 0UL; +} + +#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ + +/* + * These functions are overwritable if your architecture needs its own + * behavior. 
/*
 * follow_huge_pmd_pte() - look up the page that maps @address in a hugetlb
 * @vma.
 *
 * Returns the subpage of the huge page that covers @address, with a
 * reference taken through try_grab_page() according to @flags, or NULL
 * when there is no page table entry, the entry is not present, FOLL_PIN
 * was requested, or the page could not be grabbed.  A migration entry is
 * waited on and the lookup is then retried.
 */
struct page * __weak
follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
{
	struct hstate *h = hstate_vma(vma);
	struct mm_struct *mm = vma->vm_mm;
	struct page *page = NULL;
	spinlock_t *ptl;
	pte_t *ptep, pte;

	/*
	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
	 * follow_hugetlb_page().
	 */
	if (WARN_ON_ONCE(flags & FOLL_PIN))
		return NULL;

retry:
	ptep = huge_pte_offset(mm, address, huge_page_size(h));
	if (!ptep)
		return NULL;

	ptl = huge_pte_lock(h, mm, ptep);
	pte = huge_ptep_get(ptep);
	if (pte_present(pte)) {
		/* Head page plus the offset of @address within the huge page. */
		page = pte_page(pte) +
			((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
		/*
		 * try_grab_page() should always succeed here, because: a) we
		 * hold the pmd (ptl) lock, and b) we've just checked that the
		 * huge pmd (head) page is present in the page tables. The ptl
		 * prevents the head page and tail pages from being rearranged
		 * in any way. So this page must be available at this point,
		 * unless the page refcount overflowed:
		 */
		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
			page = NULL;
			goto out;
		}
	} else {
		if (is_hugetlb_entry_migration(pte)) {
			/* Drop the lock, wait on the migration entry, start over. */
			spin_unlock(ptl);
			__migration_entry_wait_huge(ptep, ptl);
			goto retry;
		}
		/*
		 * hwpoisoned entry is treated as no_page_table in
		 * follow_page_mask().
		 */
	}
out:
	spin_unlock(ptl);
	return page;
}
__get_huge_page_for_hwpoison(pfn, flags); + spin_unlock_irq(&hugetlb_lock); + return ret; +} + +void putback_active_hugepage(struct page *page) +{ + spin_lock_irq(&hugetlb_lock); + SetHPageMigratable(page); + list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); + spin_unlock_irq(&hugetlb_lock); + put_page(page); +} + +void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) +{ + struct hstate *h = page_hstate(oldpage); + + hugetlb_cgroup_migrate(oldpage, newpage); + set_page_owner_migrate_reason(newpage, reason); + + /* + * transfer temporary state of the new huge page. This is + * reverse to other transitions because the newpage is going to + * be final while the old one will be freed so it takes over + * the temporary status. + * + * Also note that we have to transfer the per-node surplus state + * here as well otherwise the global surplus count will not match + * the per-node's. + */ + if (HPageTemporary(newpage)) { + int old_nid = page_to_nid(oldpage); + int new_nid = page_to_nid(newpage); + + SetHPageTemporary(oldpage); + ClearHPageTemporary(newpage); + + /* + * There is no need to transfer the per-node surplus state + * when we do not cross the node. 
/*
 * hugetlb_unshare_pmds() - call huge_pmd_unshare() for every PUD_SIZE step
 * of [@start, @end) in @vma.
 *
 * Does nothing for mappings without VM_MAYSHARE or for an empty range.
 * The walk advances by PUD_SIZE, so callers are expected to pass
 * PUD_SIZE-aligned bounds (see the comment in the body).
 */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
				   unsigned long start,
				   unsigned long end)
{
	struct hstate *h = hstate_vma(vma);
	unsigned long sz = huge_page_size(h);
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	unsigned long address;
	spinlock_t *ptl;
	pte_t *ptep;

	if (!(vma->vm_flags & VM_MAYSHARE))
		return;

	if (start >= end)
		return;

	flush_cache_range(vma, start, end);
	/*
	 * No need to call adjust_range_if_pmd_sharing_possible(), because
	 * we have already done the PUD_SIZE alignment.
	 */
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				start, end);
	mmu_notifier_invalidate_range_start(&range);
	/* vma lock first, then the i_mmap lock; released in reverse below. */
	hugetlb_vma_lock_write(vma);
	i_mmap_lock_write(vma->vm_file->f_mapping);
	for (address = start; address < end; address += PUD_SIZE) {
		ptep = huge_pte_offset(mm, address, sz);
		if (!ptep)
			continue;
		/* huge_pmd_unshare() is called with the page table lock held. */
		ptl = huge_pte_lock(h, mm, ptep);
		huge_pmd_unshare(mm, vma, address, ptep);
		spin_unlock(ptl);
	}
	flush_hugetlb_tlb_range(vma, start, end);
	i_mmap_unlock_write(vma->vm_file->f_mapping);
	hugetlb_vma_unlock_write(vma);
	/*
	 * No need to call mmu_notifier_invalidate_range(), see
	 * Documentation/mm/mmu_notifier.rst.
	 */
	mmu_notifier_invalidate_range_end(&range);
}
+ */ +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) +{ + hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), + ALIGN_DOWN(vma->vm_end, PUD_SIZE)); +} + +#ifdef CONFIG_CMA +static bool cma_reserve_called __initdata; + +static int __init cmdline_parse_hugetlb_cma(char *p) +{ + int nid, count = 0; + unsigned long tmp; + char *s = p; + + while (*s) { + if (sscanf(s, "%lu%n", &tmp, &count) != 1) + break; + + if (s[count] == ':') { + if (tmp >= MAX_NUMNODES) + break; + nid = array_index_nospec(tmp, MAX_NUMNODES); + + s += count + 1; + tmp = memparse(s, &s); + hugetlb_cma_size_in_node[nid] = tmp; + hugetlb_cma_size += tmp; + + /* + * Skip the separator if have one, otherwise + * break the parsing. + */ + if (*s == ',') + s++; + else + break; + } else { + hugetlb_cma_size = memparse(p, &p); + break; + } + } + + return 0; +} + +early_param("hugetlb_cma", cmdline_parse_hugetlb_cma); + +void __init hugetlb_cma_reserve(int order) +{ + unsigned long size, reserved, per_node; + bool node_specific_cma_alloc = false; + int nid; + + cma_reserve_called = true; + + if (!hugetlb_cma_size) + return; + + for (nid = 0; nid < MAX_NUMNODES; nid++) { + if (hugetlb_cma_size_in_node[nid] == 0) + continue; + + if (!node_online(nid)) { + pr_warn("hugetlb_cma: invalid node %d specified\n", nid); + hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; + hugetlb_cma_size_in_node[nid] = 0; + continue; + } + + if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) { + pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n", + nid, (PAGE_SIZE << order) / SZ_1M); + hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; + hugetlb_cma_size_in_node[nid] = 0; + } else { + node_specific_cma_alloc = true; + } + } + + /* Validate the CMA size again in case some invalid nodes specified. 
*/ + if (!hugetlb_cma_size) + return; + + if (hugetlb_cma_size < (PAGE_SIZE << order)) { + pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", + (PAGE_SIZE << order) / SZ_1M); + hugetlb_cma_size = 0; + return; + } + + if (!node_specific_cma_alloc) { + /* + * If 3 GB area is requested on a machine with 4 numa nodes, + * let's allocate 1 GB on first three nodes and ignore the last one. + */ + per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); + pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", + hugetlb_cma_size / SZ_1M, per_node / SZ_1M); + } + + reserved = 0; + for_each_online_node(nid) { + int res; + char name[CMA_MAX_NAME]; + + if (node_specific_cma_alloc) { + if (hugetlb_cma_size_in_node[nid] == 0) + continue; + + size = hugetlb_cma_size_in_node[nid]; + } else { + size = min(per_node, hugetlb_cma_size - reserved); + } + + size = round_up(size, PAGE_SIZE << order); + + snprintf(name, sizeof(name), "hugetlb%d", nid); + /* + * Note that 'order per bit' is based on smallest size that + * may be returned to CMA allocator in the case of + * huge page demotion. + */ + res = cma_declare_contiguous_nid(0, size, 0, + PAGE_SIZE << HUGETLB_PAGE_ORDER, + 0, false, name, + &hugetlb_cma[nid], nid); + if (res) { + pr_warn("hugetlb_cma: reservation failed: err %d, node %d", + res, nid); + continue; + } + + reserved += size; + pr_info("hugetlb_cma: reserved %lu MiB on node %d\n", + size / SZ_1M, nid); + + if (reserved >= hugetlb_cma_size) + break; + } + + if (!reserved) + /* + * hugetlb_cma_size is used to determine if allocations from + * cma are possible. Set to zero if no cma regions are set up. 
+ */ + hugetlb_cma_size = 0; +} + +static void __init hugetlb_cma_check(void) +{ + if (!hugetlb_cma_size || cma_reserve_called) + return; + + pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); +} + +#endif /* CONFIG_CMA */ diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c new file mode 100644 index 000000000..f61d132df --- /dev/null +++ b/mm/hugetlb_cgroup.c @@ -0,0 +1,919 @@ +/* + * + * Copyright IBM Corporation, 2012 + * Author Aneesh Kumar K.V + * + * Cgroup v2 + * Copyright (C) 2019 Red Hat, Inc. + * Author: Giuseppe Scrivano + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include +#include +#include +#include + +#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) +#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) +#define MEMFILE_ATTR(val) ((val) & 0xffff) + +static struct hugetlb_cgroup *root_h_cgroup __read_mostly; + +static inline struct page_counter * +__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, + bool rsvd) +{ + if (rsvd) + return &h_cg->rsvd_hugepage[idx]; + return &h_cg->hugepage[idx]; +} + +static inline struct page_counter * +hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) +{ + return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false); +} + +static inline struct page_counter * +hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) +{ + return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true); +} + +static inline +struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) +{ + return s ? 
container_of(s, struct hugetlb_cgroup, css) : NULL; +} + +static inline +struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) +{ + return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); +} + +static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) +{ + return (h_cg == root_h_cgroup); +} + +static inline struct hugetlb_cgroup * +parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) +{ + return hugetlb_cgroup_from_css(h_cg->css.parent); +} + +static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) +{ + struct hstate *h; + + for_each_hstate(h) { + if (page_counter_read( + hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h)))) + return true; + } + return false; +} + +static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, + struct hugetlb_cgroup *parent_h_cgroup) +{ + int idx; + + for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { + struct page_counter *fault_parent = NULL; + struct page_counter *rsvd_parent = NULL; + unsigned long limit; + int ret; + + if (parent_h_cgroup) { + fault_parent = hugetlb_cgroup_counter_from_cgroup( + parent_h_cgroup, idx); + rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( + parent_h_cgroup, idx); + } + page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup, + idx), + fault_parent); + page_counter_init( + hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), + rsvd_parent); + + limit = round_down(PAGE_COUNTER_MAX, + pages_per_huge_page(&hstates[idx])); + + ret = page_counter_set_max( + hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), + limit); + VM_BUG_ON(ret); + ret = page_counter_set_max( + hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), + limit); + VM_BUG_ON(ret); + } +} + +static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) +{ + int node; + + for_each_node(node) + kfree(h_cgroup->nodeinfo[node]); + kfree(h_cgroup); +} + +static struct cgroup_subsys_state * +hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +{ + 
struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); + struct hugetlb_cgroup *h_cgroup; + int node; + + h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), + GFP_KERNEL); + + if (!h_cgroup) + return ERR_PTR(-ENOMEM); + + if (!parent_h_cgroup) + root_h_cgroup = h_cgroup; + + /* + * TODO: this routine can waste much memory for nodes which will + * never be onlined. It's better to use memory hotplug callback + * function. + */ + for_each_node(node) { + /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ + int node_to_alloc = + node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; + h_cgroup->nodeinfo[node] = + kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), + GFP_KERNEL, node_to_alloc); + if (!h_cgroup->nodeinfo[node]) + goto fail_alloc_nodeinfo; + } + + hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); + return &h_cgroup->css; + +fail_alloc_nodeinfo: + hugetlb_cgroup_free(h_cgroup); + return ERR_PTR(-ENOMEM); +} + +static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) +{ + hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); +} + +/* + * Should be called with hugetlb_lock held. + * Since we are holding hugetlb_lock, pages cannot get moved from + * active list or uncharged from the cgroup, So no need to get + * page reference and test for page active here. This function + * cannot fail. + */ +static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, + struct page *page) +{ + unsigned int nr_pages; + struct page_counter *counter; + struct hugetlb_cgroup *page_hcg; + struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); + + page_hcg = hugetlb_cgroup_from_page(page); + /* + * We can have pages in active list without any cgroup + * ie, hugepage with less than 3 pages. We can safely + * ignore those pages. 
+ */ + if (!page_hcg || page_hcg != h_cg) + goto out; + + nr_pages = compound_nr(page); + if (!parent) { + parent = root_h_cgroup; + /* root has no limit */ + page_counter_charge(&parent->hugepage[idx], nr_pages); + } + counter = &h_cg->hugepage[idx]; + /* Take the pages off the local counter */ + page_counter_cancel(counter, nr_pages); + + set_hugetlb_cgroup(page, parent); +out: + return; +} + +/* + * Force the hugetlb cgroup to empty the hugetlb resources by moving them to + * the parent cgroup. + */ +static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + struct hstate *h; + struct page *page; + + do { + for_each_hstate(h) { + spin_lock_irq(&hugetlb_lock); + list_for_each_entry(page, &h->hugepage_activelist, lru) + hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page); + + spin_unlock_irq(&hugetlb_lock); + } + cond_resched(); + } while (hugetlb_cgroup_have_usage(h_cg)); +} + +static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, + enum hugetlb_memory_event event) +{ + atomic_long_inc(&hugetlb->events_local[idx][event]); + cgroup_file_notify(&hugetlb->events_local_file[idx]); + + do { + atomic_long_inc(&hugetlb->events[idx][event]); + cgroup_file_notify(&hugetlb->events_file[idx]); + } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && + !hugetlb_cgroup_is_root(hugetlb)); +} + +static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup **ptr, + bool rsvd) +{ + int ret = 0; + struct page_counter *counter; + struct hugetlb_cgroup *h_cg = NULL; + + if (hugetlb_cgroup_disabled()) + goto done; + /* + * We don't charge any cgroup if the compound page have less + * than 3 pages. 
+ */ + if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) + goto done; +again: + rcu_read_lock(); + h_cg = hugetlb_cgroup_from_task(current); + if (!css_tryget(&h_cg->css)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + + if (!page_counter_try_charge( + __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), + nr_pages, &counter)) { + ret = -ENOMEM; + hugetlb_event(h_cg, idx, HUGETLB_MAX); + css_put(&h_cg->css); + goto done; + } + /* Reservations take a reference to the css because they do not get + * reparented. + */ + if (!rsvd) + css_put(&h_cg->css); +done: + *ptr = h_cg; + return ret; +} + +int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup **ptr) +{ + return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false); +} + +int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, + struct hugetlb_cgroup **ptr) +{ + return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true); +} + +/* Should be called with hugetlb_lock held */ +static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + struct page *page, bool rsvd) +{ + if (hugetlb_cgroup_disabled() || !h_cg) + return; + + __set_hugetlb_cgroup(page, h_cg, rsvd); + if (!rsvd) { + unsigned long usage = + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + /* + * This write is not atomic due to fetching usage and writing + * to it, but that's fine because we call this with + * hugetlb_lock held anyway. 
+ */ + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + usage + nr_pages); + } +} + +void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + struct page *page) +{ + __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false); +} + +void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + struct page *page) +{ + __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true); +} + +/* + * Should be called with hugetlb_lock held + */ +static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, + struct page *page, bool rsvd) +{ + struct hugetlb_cgroup *h_cg; + + if (hugetlb_cgroup_disabled()) + return; + lockdep_assert_held(&hugetlb_lock); + h_cg = __hugetlb_cgroup_from_page(page, rsvd); + if (unlikely(!h_cg)) + return; + __set_hugetlb_cgroup(page, NULL, rsvd); + + page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, + rsvd), + nr_pages); + + if (rsvd) + css_put(&h_cg->css); + else { + unsigned long usage = + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + /* + * This write is not atomic due to fetching usage and writing + * to it, but that's fine because we call this with + * hugetlb_lock held anyway. 
+ */ + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + usage - nr_pages); + } +} + +void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, + struct page *page) +{ + __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false); +} + +void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages, + struct page *page) +{ + __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true); +} + +static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + bool rsvd) +{ + if (hugetlb_cgroup_disabled() || !h_cg) + return; + + if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) + return; + + page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, + rsvd), + nr_pages); + + if (rsvd) + css_put(&h_cg->css); +} + +void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg) +{ + __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false); +} + +void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg) +{ + __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true); +} + +void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, + unsigned long end) +{ + if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || + !resv->css) + return; + + page_counter_uncharge(resv->reservation_counter, + (end - start) * resv->pages_per_hpage); + css_put(resv->css); +} + +void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, + struct file_region *rg, + unsigned long nr_pages, + bool region_del) +{ + if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) + return; + + if (rg->reservation_counter && resv->pages_per_hpage && + !resv->reservation_counter) { + page_counter_uncharge(rg->reservation_counter, + nr_pages * resv->pages_per_hpage); + /* + * Only do css_put(rg->css) when we delete the entire region + * because one file_region must hold exactly one css 
reference. + */ + if (region_del) + css_put(rg->css); + } +} + +enum { + RES_USAGE, + RES_RSVD_USAGE, + RES_LIMIT, + RES_RSVD_LIMIT, + RES_MAX_USAGE, + RES_RSVD_MAX_USAGE, + RES_FAILCNT, + RES_RSVD_FAILCNT, +}; + +static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) +{ + int nid; + struct cftype *cft = seq_cft(seq); + int idx = MEMFILE_IDX(cft->private); + bool legacy = MEMFILE_ATTR(cft->private); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + struct cgroup_subsys_state *css; + unsigned long usage; + + if (legacy) { + /* Add up usage across all nodes for the non-hierarchical total. */ + usage = 0; + for_each_node_state(nid, N_MEMORY) + usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); + seq_printf(seq, "total=%lu", usage * PAGE_SIZE); + + /* Simply print the per-node usage for the non-hierarchical total. */ + for_each_node_state(nid, N_MEMORY) + seq_printf(seq, " N%d=%lu", nid, + READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * + PAGE_SIZE); + seq_putc(seq, '\n'); + } + + /* + * The hierarchical total is pretty much the value recorded by the + * counter, so use that. + */ + seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", + page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); + + /* + * For each node, traverse the css tree to obtain the hierarchical + * node usage. 
+ */ + for_each_node_state(nid, N_MEMORY) { + usage = 0; + rcu_read_lock(); + css_for_each_descendant_pre(css, &h_cg->css) { + usage += READ_ONCE(hugetlb_cgroup_from_css(css) + ->nodeinfo[nid] + ->usage[idx]); + } + rcu_read_unlock(); + seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); + } + + seq_putc(seq, '\n'); + + return 0; +} + +static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct page_counter *counter; + struct page_counter *rsvd_counter; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + + counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; + rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; + + switch (MEMFILE_ATTR(cft->private)) { + case RES_USAGE: + return (u64)page_counter_read(counter) * PAGE_SIZE; + case RES_RSVD_USAGE: + return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE; + case RES_LIMIT: + return (u64)counter->max * PAGE_SIZE; + case RES_RSVD_LIMIT: + return (u64)rsvd_counter->max * PAGE_SIZE; + case RES_MAX_USAGE: + return (u64)counter->watermark * PAGE_SIZE; + case RES_RSVD_MAX_USAGE: + return (u64)rsvd_counter->watermark * PAGE_SIZE; + case RES_FAILCNT: + return counter->failcnt; + case RES_RSVD_FAILCNT: + return rsvd_counter->failcnt; + default: + BUG(); + } +} + +static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) +{ + int idx; + u64 val; + struct cftype *cft = seq_cft(seq); + unsigned long limit; + struct page_counter *counter; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + + idx = MEMFILE_IDX(cft->private); + counter = &h_cg->hugepage[idx]; + + limit = round_down(PAGE_COUNTER_MAX, + pages_per_huge_page(&hstates[idx])); + + switch (MEMFILE_ATTR(cft->private)) { + case RES_RSVD_USAGE: + counter = &h_cg->rsvd_hugepage[idx]; + fallthrough; + case RES_USAGE: + val = (u64)page_counter_read(counter); + seq_printf(seq, "%llu\n", val * PAGE_SIZE); + break; + case RES_RSVD_LIMIT: + counter = &h_cg->rsvd_hugepage[idx]; + 
fallthrough; + case RES_LIMIT: + val = (u64)counter->max; + if (val == limit) + seq_puts(seq, "max\n"); + else + seq_printf(seq, "%llu\n", val * PAGE_SIZE); + break; + default: + BUG(); + } + + return 0; +} + +static DEFINE_MUTEX(hugetlb_limit_mutex); + +static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, + const char *max) +{ + int ret, idx; + unsigned long nr_pages; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); + bool rsvd = false; + + if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ + return -EINVAL; + + buf = strstrip(buf); + ret = page_counter_memparse(buf, max, &nr_pages); + if (ret) + return ret; + + idx = MEMFILE_IDX(of_cft(of)->private); + nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); + + switch (MEMFILE_ATTR(of_cft(of)->private)) { + case RES_RSVD_LIMIT: + rsvd = true; + fallthrough; + case RES_LIMIT: + mutex_lock(&hugetlb_limit_mutex); + ret = page_counter_set_max( + __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), + nr_pages); + mutex_unlock(&hugetlb_limit_mutex); + break; + default: + ret = -EINVAL; + break; + } + return ret ?: nbytes; +} + +static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); +} + +static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); +} + +static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + int ret = 0; + struct page_counter *counter, *rsvd_counter; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); + + counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; + rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; + + switch (MEMFILE_ATTR(of_cft(of)->private)) { + case RES_MAX_USAGE: + 
page_counter_reset_watermark(counter); + break; + case RES_RSVD_MAX_USAGE: + page_counter_reset_watermark(rsvd_counter); + break; + case RES_FAILCNT: + counter->failcnt = 0; + break; + case RES_RSVD_FAILCNT: + rsvd_counter->failcnt = 0; + break; + default: + ret = -EINVAL; + break; + } + return ret ?: nbytes; +} + +static char *mem_fmt(char *buf, int size, unsigned long hsize) +{ + if (hsize >= SZ_1G) + snprintf(buf, size, "%luGB", hsize / SZ_1G); + else if (hsize >= SZ_1M) + snprintf(buf, size, "%luMB", hsize / SZ_1M); + else + snprintf(buf, size, "%luKB", hsize / SZ_1K); + return buf; +} + +static int __hugetlb_events_show(struct seq_file *seq, bool local) +{ + int idx; + long max; + struct cftype *cft = seq_cft(seq); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + + idx = MEMFILE_IDX(cft->private); + + if (local) + max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); + else + max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); + + seq_printf(seq, "max %lu\n", max); + + return 0; +} + +static int hugetlb_events_show(struct seq_file *seq, void *v) +{ + return __hugetlb_events_show(seq, false); +} + +static int hugetlb_events_local_show(struct seq_file *seq, void *v) +{ + return __hugetlb_events_show(seq, true); +} + +static void __init __hugetlb_cgroup_file_dfl_init(int idx) +{ + char buf[32]; + struct cftype *cft; + struct hstate *h = &hstates[idx]; + + /* format the size */ + mem_fmt(buf, sizeof(buf), huge_page_size(h)); + + /* Add the limit file */ + cft = &h->cgroup_files_dfl[0]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); + cft->seq_show = hugetlb_cgroup_read_u64_max; + cft->write = hugetlb_cgroup_write_dfl; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the reservation limit file */ + cft = &h->cgroup_files_dfl[1]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT); + cft->seq_show = 
hugetlb_cgroup_read_u64_max; + cft->write = hugetlb_cgroup_write_dfl; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the current usage file */ + cft = &h->cgroup_files_dfl[2]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); + cft->seq_show = hugetlb_cgroup_read_u64_max; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the current reservation usage file */ + cft = &h->cgroup_files_dfl[3]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.current", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE); + cft->seq_show = hugetlb_cgroup_read_u64_max; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the events file */ + cft = &h->cgroup_files_dfl[4]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf); + cft->private = MEMFILE_PRIVATE(idx, 0); + cft->seq_show = hugetlb_events_show; + cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]); + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the events.local file */ + cft = &h->cgroup_files_dfl[5]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf); + cft->private = MEMFILE_PRIVATE(idx, 0); + cft->seq_show = hugetlb_events_local_show; + cft->file_offset = offsetof(struct hugetlb_cgroup, + events_local_file[idx]); + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the numa stat file */ + cft = &h->cgroup_files_dfl[6]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->private = MEMFILE_PRIVATE(idx, 0); + cft->seq_show = hugetlb_cgroup_read_numa_stat; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_dfl[7]; + memset(cft, 0, sizeof(*cft)); + + WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, + h->cgroup_files_dfl)); +} + +static void __init __hugetlb_cgroup_file_legacy_init(int idx) +{ + char buf[32]; + struct cftype *cft; + struct hstate *h = &hstates[idx]; + + /* format the size */ + mem_fmt(buf, sizeof(buf), huge_page_size(h)); + + /* Add the limit file */ + cft = 
&h->cgroup_files_legacy[0]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); + cft->read_u64 = hugetlb_cgroup_read_u64; + cft->write = hugetlb_cgroup_write_legacy; + + /* Add the reservation limit file */ + cft = &h->cgroup_files_legacy[1]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.limit_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT); + cft->read_u64 = hugetlb_cgroup_read_u64; + cft->write = hugetlb_cgroup_write_legacy; + + /* Add the usage file */ + cft = &h->cgroup_files_legacy[2]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); + cft->read_u64 = hugetlb_cgroup_read_u64; + + /* Add the reservation usage file */ + cft = &h->cgroup_files_legacy[3]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.usage_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE); + cft->read_u64 = hugetlb_cgroup_read_u64; + + /* Add the MAX usage file */ + cft = &h->cgroup_files_legacy[4]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); + cft->write = hugetlb_cgroup_reset; + cft->read_u64 = hugetlb_cgroup_read_u64; + + /* Add the MAX reservation usage file */ + cft = &h->cgroup_files_legacy[5]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max_usage_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_MAX_USAGE); + cft->write = hugetlb_cgroup_reset; + cft->read_u64 = hugetlb_cgroup_read_u64; + + /* Add the failcntfile */ + cft = &h->cgroup_files_legacy[6]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); + cft->write = hugetlb_cgroup_reset; + cft->read_u64 = hugetlb_cgroup_read_u64; + + /* Add the reservation failcntfile */ + cft = &h->cgroup_files_legacy[7]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.failcnt", buf); + cft->private = MEMFILE_PRIVATE(idx, 
RES_RSVD_FAILCNT); + cft->write = hugetlb_cgroup_reset; + cft->read_u64 = hugetlb_cgroup_read_u64; + + /* Add the numa stat file */ + cft = &h->cgroup_files_legacy[8]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->private = MEMFILE_PRIVATE(idx, 1); + cft->seq_show = hugetlb_cgroup_read_numa_stat; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_legacy[9]; + memset(cft, 0, sizeof(*cft)); + + WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, + h->cgroup_files_legacy)); +} + +static void __init __hugetlb_cgroup_file_init(int idx) +{ + __hugetlb_cgroup_file_dfl_init(idx); + __hugetlb_cgroup_file_legacy_init(idx); +} + +void __init hugetlb_cgroup_file_init(void) +{ + struct hstate *h; + + for_each_hstate(h) { + /* + * Add cgroup control files only if the huge page consists + * of more than two normal pages. This is because we use + * page[2].private for storing cgroup details. + */ + if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) + __hugetlb_cgroup_file_init(hstate_index(h)); + } +} + +/* + * hugetlb_lock will make sure a parallel cgroup rmdir won't happen + * when we migrate hugepages + */ +void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) +{ + struct hugetlb_cgroup *h_cg; + struct hugetlb_cgroup *h_cg_rsvd; + struct hstate *h = page_hstate(oldhpage); + + if (hugetlb_cgroup_disabled()) + return; + + spin_lock_irq(&hugetlb_lock); + h_cg = hugetlb_cgroup_from_page(oldhpage); + h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage); + set_hugetlb_cgroup(oldhpage, NULL); + set_hugetlb_cgroup_rsvd(oldhpage, NULL); + + /* move the h_cg details to new cgroup */ + set_hugetlb_cgroup(newhpage, h_cg); + set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd); + list_move(&newhpage->lru, &h->hugepage_activelist); + spin_unlock_irq(&hugetlb_lock); + return; +} + +static struct cftype hugetlb_files[] = { + {} /* terminate */ +}; + +struct cgroup_subsys hugetlb_cgrp_subsys = { + .css_alloc = hugetlb_cgroup_css_alloc, + 
.css_offline = hugetlb_cgroup_css_offline, + .css_free = hugetlb_cgroup_css_free, + .dfl_cftypes = hugetlb_files, + .legacy_cftypes = hugetlb_files, +}; diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c new file mode 100644 index 000000000..c04214055 --- /dev/null +++ b/mm/hugetlb_vmemmap.c @@ -0,0 +1,577 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HugeTLB Vmemmap Optimization (HVO) + * + * Copyright (c) 2020, ByteDance. All rights reserved. + * + * Author: Muchun Song + * + * See Documentation/mm/vmemmap_dedup.rst + */ +#define pr_fmt(fmt) "HugeTLB: " fmt + +#include +#include +#include +#include +#include +#include "hugetlb_vmemmap.h" + +/** + * struct vmemmap_remap_walk - walk vmemmap page table + * + * @remap_pte: called for each lowest-level entry (PTE). + * @nr_walked: the number of walked pte. + * @reuse_page: the page which is reused for the tail vmemmap pages. + * @reuse_addr: the virtual address of the @reuse_page page. + * @vmemmap_pages: the list head of the vmemmap pages that can be freed + * or is mapped from. + */ +struct vmemmap_remap_walk { + void (*remap_pte)(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk); + unsigned long nr_walked; + struct page *reuse_page; + unsigned long reuse_addr; + struct list_head *vmemmap_pages; +}; + +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + pmd_t __pmd; + int i; + unsigned long addr = start; + struct page *head; + pte_t *pgtable; + + spin_lock(&init_mm.page_table_lock); + head = pmd_leaf(*pmd) ? 
pmd_page(*pmd) : NULL; + spin_unlock(&init_mm.page_table_lock); + + if (!head) + return 0; + + pgtable = pte_alloc_one_kernel(&init_mm); + if (!pgtable) + return -ENOMEM; + + pmd_populate_kernel(&init_mm, &__pmd, pgtable); + + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { + pte_t entry, *pte; + pgprot_t pgprot = PAGE_KERNEL; + + entry = mk_pte(head + i, pgprot); + pte = pte_offset_kernel(&__pmd, addr); + set_pte_at(&init_mm, addr, pte, entry); + } + + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* + * Higher order allocations from buddy allocator must be able to + * be treated as independent small pages (as they can be freed + * individually). + */ + if (!PageReserved(head)) + split_page(head, get_order(PMD_SIZE)); + + /* Make pte visible before pmd. See comment in pmd_install(). */ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock); + + return 0; +} + +static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + + /* + * The reuse_page is found 'first' in table walk before we start + * remapping (which is calling @walk->remap_pte). + */ + if (!walk->reuse_page) { + walk->reuse_page = pte_page(*pte); + /* + * Because the reuse address is part of the range that we are + * walking, skip the reuse address range. 
+ */ + addr += PAGE_SIZE; + pte++; + walk->nr_walked++; + } + + for (; addr != end; addr += PAGE_SIZE, pte++) { + walk->remap_pte(pte, addr, walk); + walk->nr_walked++; + } +} + +static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + int ret; + + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret; + + next = pmd_addr_end(addr, end); + vmemmap_pte_range(pmd, addr, next, walk); + } while (pmd++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(p4d, addr); + do { + int ret; + + next = pud_addr_end(addr, end); + ret = vmemmap_pmd_range(pud, addr, next, walk); + if (ret) + return ret; + } while (pud++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_offset(pgd, addr); + do { + int ret; + + next = p4d_addr_end(addr, end); + ret = vmemmap_pud_range(p4d, addr, next, walk); + if (ret) + return ret; + } while (p4d++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_remap_range(unsigned long start, unsigned long end, + struct vmemmap_remap_walk *walk) +{ + unsigned long addr = start; + unsigned long next; + pgd_t *pgd; + + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); + + pgd = pgd_offset_k(addr); + do { + int ret; + + next = pgd_addr_end(addr, end); + ret = vmemmap_p4d_range(pgd, addr, next, walk); + if (ret) + return ret; + } while (pgd++, addr = next, addr != end); + + /* + * We only change the mapping of the vmemmap virtual address range + * [@start + PAGE_SIZE, end), so we only need to flush the TLB which + * belongs to the range. 
+ */ + flush_tlb_kernel_range(start + PAGE_SIZE, end); + + return 0; +} + +/* + * Free a vmemmap page. A vmemmap page can be allocated from the memblock + * allocator or buddy allocator. If the PG_reserved flag is set, it means + * that it allocated from the memblock allocator, just free it via the + * free_bootmem_page(). Otherwise, use __free_page(). + */ +static inline void free_vmemmap_page(struct page *page) +{ + if (PageReserved(page)) + free_bootmem_page(page); + else + __free_page(page); +} + +/* Free a list of the vmemmap pages */ +static void free_vmemmap_page_list(struct list_head *list) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, list, lru) { + list_del(&page->lru); + free_vmemmap_page(page); + } +} + +static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk) +{ + /* + * Remap the tail pages as read-only to catch illegal write operation + * to the tail pages. + */ + pgprot_t pgprot = PAGE_KERNEL_RO; + pte_t entry = mk_pte(walk->reuse_page, pgprot); + struct page *page = pte_page(*pte); + + list_add_tail(&page->lru, walk->vmemmap_pages); + set_pte_at(&init_mm, addr, pte, entry); +} + +/* + * How many struct page structs need to be reset. When we reuse the head + * struct page, the special metadata (e.g. page->flags or page->mapping) + * cannot copy to the tail struct page structs. The invalid value will be + * checked in the free_tail_pages_check(). In order to avoid the message + * of "corrupted mapping in tail page". We need to reset at least 3 (one + * head struct page struct and two tail struct page structs) struct page + * structs. 
+ */ +#define NR_RESET_STRUCT_PAGE 3 + +static inline void reset_struct_pages(struct page *start) +{ + struct page *from = start + NR_RESET_STRUCT_PAGE; + + BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); + memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); +} + +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk) +{ + pgprot_t pgprot = PAGE_KERNEL; + struct page *page; + void *to; + + BUG_ON(pte_page(*pte) != walk->reuse_page); + + page = list_first_entry(walk->vmemmap_pages, struct page, lru); + list_del(&page->lru); + to = page_to_virt(page); + copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to); + + /* + * Makes sure that preceding stores to the page contents become visible + * before the set_pte_at() write. + */ + smp_wmb(); + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); +} + +/** + * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) + * to the page which @reuse is mapped to, then free vmemmap + * which the range are mapped to. + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * + * Return: %0 on success, negative error code otherwise. + */ +static int vmemmap_remap_free(unsigned long start, unsigned long end, + unsigned long reuse) +{ + int ret; + LIST_HEAD(vmemmap_pages); + struct vmemmap_remap_walk walk = { + .remap_pte = vmemmap_remap_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + /* + * In order to make remapping routine most efficient for the huge pages, + * the routine of vmemmap page table walking has the following rules + * (see more details from the vmemmap_pte_range()): + * + * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) + * should be continuous. 
+ * - The @reuse address is part of the range [@reuse, @end) that we are + * walking which is passed to vmemmap_remap_range(). + * - The @reuse address is the first in the complete range. + * + * So we need to make sure that @start and @reuse meet the above rules. + */ + BUG_ON(start - reuse != PAGE_SIZE); + + mmap_read_lock(&init_mm); + ret = vmemmap_remap_range(reuse, end, &walk); + if (ret && walk.nr_walked) { + end = reuse + walk.nr_walked * PAGE_SIZE; + /* + * vmemmap_pages contains pages from the previous + * vmemmap_remap_range call which failed. These + * are pages which were removed from the vmemmap. + * They will be restored in the following call. + */ + walk = (struct vmemmap_remap_walk) { + .remap_pte = vmemmap_restore_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + vmemmap_remap_range(reuse, end, &walk); + } + mmap_read_unlock(&init_mm); + + free_vmemmap_page_list(&vmemmap_pages); + + return ret; +} + +static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, + gfp_t gfp_mask, struct list_head *list) +{ + unsigned long nr_pages = (end - start) >> PAGE_SHIFT; + int nid = page_to_nid((struct page *)start); + struct page *page, *next; + + while (nr_pages--) { + page = alloc_pages_node(nid, gfp_mask, 0); + if (!page) + goto out; + list_add_tail(&page->lru, list); + } + + return 0; +out: + list_for_each_entry_safe(page, next, list, lru) + __free_pages(page, 0); + return -ENOMEM; +} + +/** + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) + * to the page which is from the @vmemmap_pages + * respectively. + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * @gfp_mask: GFP flag for allocating vmemmap pages. + * + * Return: %0 on success, negative error code otherwise. 
+ */ +static int vmemmap_remap_alloc(unsigned long start, unsigned long end, + unsigned long reuse, gfp_t gfp_mask) +{ + LIST_HEAD(vmemmap_pages); + struct vmemmap_remap_walk walk = { + .remap_pte = vmemmap_restore_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + /* See the comment in the vmemmap_remap_free(). */ + BUG_ON(start - reuse != PAGE_SIZE); + + if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages)) + return -ENOMEM; + + mmap_read_lock(&init_mm); + vmemmap_remap_range(reuse, end, &walk); + mmap_read_unlock(&init_mm); + + return 0; +} + +DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); +EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); + +static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); +core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); + +/** + * hugetlb_vmemmap_restore - restore previously optimized (by + * hugetlb_vmemmap_optimize()) vmemmap pages which + * will be reallocated and remapped. + * @h: struct hstate. + * @head: the head page whose vmemmap pages will be restored. + * + * Return: %0 if @head's vmemmap pages have been reallocated and remapped, + * negative error code otherwise. + */ +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) +{ + int ret; + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; + + if (!HPageVmemmapOptimized(head)) + return 0; + + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; + + /* + * The pages which the vmemmap virtual address range [@vmemmap_start, + * @vmemmap_end) are mapped to are freed to the buddy allocator, and + * the range is mapped to the page which @vmemmap_reuse is mapped to. + * When a HugeTLB page is freed to the buddy allocator, previously + * discarded vmemmap pages must be allocated and remapped. 
+ */ + ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, + GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); + if (!ret) { + ClearHPageVmemmapOptimized(head); + static_branch_dec(&hugetlb_optimize_vmemmap_key); + } + + return ret; +} + +/* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ +static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head) +{ + if (!READ_ONCE(vmemmap_optimize_enabled)) + return false; + + if (!hugetlb_vmemmap_optimizable(h)) + return false; + + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { + pmd_t *pmdp, pmd; + struct page *vmemmap_page; + unsigned long vaddr = (unsigned long)head; + + /* + * Only the vmemmap page's vmemmap page can be self-hosted. + * Walking the page tables to find the backing page of the + * vmemmap page. + */ + pmdp = pmd_off_k(vaddr); + /* + * The READ_ONCE() is used to stabilize *pmdp in a register or + * on the stack so that it will stop changing under the code. + * The only concurrent operation where it can be changed is + * split_vmemmap_huge_pmd() (*pmdp will be stable after this + * operation). + */ + pmd = READ_ONCE(*pmdp); + if (pmd_leaf(pmd)) + vmemmap_page = pmd_page(pmd) + pte_index(vaddr); + else + vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr)); + /* + * Due to HugeTLB alignment requirements and the vmemmap pages + * being at the start of the hotplugged memory region in + * memory_hotplug.memmap_on_memory case. Checking any vmemmap + * page's vmemmap page if it is marked as VmemmapSelfHosted is + * sufficient. + * + * [ hotplugged memory ] + * [ section ][...][ section ] + * [ vmemmap ][ usable memory ] + * ^ | | | + * +---+ | | + * ^ | | + * +-------+ | + * ^ | + * +-------------------------------------------+ + */ + if (PageVmemmapSelfHosted(vmemmap_page)) + return false; + } + + return true; +} + +/** + * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages. + * @h: struct hstate. 
+ * @head: the head page whose vmemmap pages will be optimized. + * + * This function only tries to optimize @head's vmemmap pages and does not + * guarantee that the optimization will succeed after it returns. The caller + * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages + * have been optimized. + */ +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) +{ + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; + + if (!vmemmap_should_optimize(h, head)) + return; + + static_branch_inc(&hugetlb_optimize_vmemmap_key); + + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; + + /* + * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) + * to the page which @vmemmap_reuse is mapped to, then free the pages + * which the range [@vmemmap_start, @vmemmap_end] is mapped to. + */ + if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse)) + static_branch_dec(&hugetlb_optimize_vmemmap_key); + else + SetHPageVmemmapOptimized(head); +} + +static struct ctl_table hugetlb_vmemmap_sysctls[] = { + { + .procname = "hugetlb_optimize_vmemmap", + .data = &vmemmap_optimize_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dobool, + }, + { } +}; + +static int __init hugetlb_vmemmap_init(void) +{ + /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ + BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE); + + if (IS_ENABLED(CONFIG_PROC_SYSCTL)) { + const struct hstate *h; + + for_each_hstate(h) { + if (hugetlb_vmemmap_optimizable(h)) { + register_sysctl_init("vm", hugetlb_vmemmap_sysctls); + break; + } + } + } + return 0; +} +late_initcall(hugetlb_vmemmap_init); diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h new file mode 100644 index 000000000..25bd0e002 --- /dev/null +++ b/mm/hugetlb_vmemmap.h @@ -0,0 +1,60 
@@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HugeTLB Vmemmap Optimization (HVO) + * + * Copyright (c) 2020, ByteDance. All rights reserved. + * + * Author: Muchun Song + */ +#ifndef _LINUX_HUGETLB_VMEMMAP_H +#define _LINUX_HUGETLB_VMEMMAP_H +#include + +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); + +/* + * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See + * Documentation/vm/vmemmap_dedup.rst. + */ +#define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE + +static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) +{ + return pages_per_huge_page(h) * sizeof(struct page); +} + +/* + * Return how many vmemmap size associated with a HugeTLB page that can be + * optimized and can be freed to the buddy allocator. + */ +static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) +{ + int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE; + + if (!is_power_of_2(sizeof(struct page))) + return 0; + return size > 0 ? 
size : 0; +} +#else +static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) +{ + return 0; +} + +static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) +{ +} + +static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) +{ + return 0; +} +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ + +static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h) +{ + return hugetlb_vmemmap_optimizable_size(h) != 0; +} +#endif /* _LINUX_HUGETLB_VMEMMAP_H */ diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c new file mode 100644 index 000000000..d0548e382 --- /dev/null +++ b/mm/hwpoison-inject.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Inject a hwpoison memory failure on a arbitrary pfn */ +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static struct dentry *hwpoison_dir; + +static int hwpoison_inject(void *data, u64 val) +{ + unsigned long pfn = val; + struct page *p; + struct page *hpage; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!pfn_valid(pfn)) + return -ENXIO; + + p = pfn_to_page(pfn); + hpage = compound_head(p); + + if (!hwpoison_filter_enable) + goto inject; + + shake_page(hpage); + /* + * This implies unable to support non-LRU pages except free page. + */ + if (!PageLRU(hpage) && !PageHuge(p) && !is_free_buddy_page(p)) + return 0; + + /* + * do a racy check to make sure PG_hwpoison will only be set for + * the targeted owner (or on a free page). + * memory_failure() will redo the check reliably inside page lock. + */ + err = hwpoison_filter(hpage); + if (err) + return 0; + +inject: + pr_info("Injecting memory failure at pfn %#lx\n", pfn); + err = memory_failure(pfn, MF_SW_SIMULATED); + return (err == -EOPNOTSUPP) ? 
0 : err; +} + +static int hwpoison_unpoison(void *data, u64 val) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return unpoison_memory(val); +} + +DEFINE_DEBUGFS_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); +DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); + +static void __exit pfn_inject_exit(void) +{ + hwpoison_filter_enable = 0; + debugfs_remove_recursive(hwpoison_dir); +} + +static int __init pfn_inject_init(void) +{ + hwpoison_dir = debugfs_create_dir("hwpoison", NULL); + + /* + * Note that the below poison/unpoison interfaces do not involve + * hardware status change, hence do not require hardware support. + * They are mainly for testing hwpoison in software level. + */ + debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir, NULL, + &hwpoison_fops); + + debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir, NULL, + &unpoison_fops); + + debugfs_create_u32("corrupt-filter-enable", 0600, hwpoison_dir, + &hwpoison_filter_enable); + + debugfs_create_u32("corrupt-filter-dev-major", 0600, hwpoison_dir, + &hwpoison_filter_dev_major); + + debugfs_create_u32("corrupt-filter-dev-minor", 0600, hwpoison_dir, + &hwpoison_filter_dev_minor); + + debugfs_create_u64("corrupt-filter-flags-mask", 0600, hwpoison_dir, + &hwpoison_filter_flags_mask); + + debugfs_create_u64("corrupt-filter-flags-value", 0600, hwpoison_dir, + &hwpoison_filter_flags_value); + +#ifdef CONFIG_MEMCG + debugfs_create_u64("corrupt-filter-memcg", 0600, hwpoison_dir, + &hwpoison_filter_memcg); +#endif + + return 0; +} + +module_init(pfn_inject_init); +module_exit(pfn_inject_exit); +MODULE_LICENSE("GPL"); diff --git a/mm/init-mm.c b/mm/init-mm.c new file mode 100644 index 000000000..c9327abb7 --- /dev/null +++ b/mm/init-mm.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifndef INIT_MM_CONTEXT +#define INIT_MM_CONTEXT(name) 
+#endif + +/* + * For dynamically allocated mm_structs, there is a dynamically sized cpumask + * at the end of the structure, the size of which depends on the maximum CPU + * number the system can see. That way we allocate only as much memory for + * mm_cpumask() as needed for the hundreds, or thousands of processes that + * a system typically runs. + * + * Since there is only one init_mm in the entire system, keep it simple + * and size this cpu_bitmask to NR_CPUS. + */ +struct mm_struct init_mm = { + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock), + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(init_mm.write_protect_seq), + MMAP_LOCK_INITIALIZER(init_mm) + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .user_ns = &init_user_ns, + .cpu_bitmap = CPU_BITS_NONE, +#ifdef CONFIG_IOMMU_SVA + .pasid = INVALID_IOASID, +#endif + INIT_MM_CONTEXT(init_mm) +}; + +void setup_initial_init_mm(void *start_code, void *end_code, + void *end_data, void *brk) +{ + init_mm.start_code = (unsigned long)start_code; + init_mm.end_code = (unsigned long)end_code; + init_mm.end_data = (unsigned long)end_data; + init_mm.brk = (unsigned long)brk; +} diff --git a/mm/internal.h b/mm/internal.h new file mode 100644 index 000000000..d01130efc --- /dev/null +++ b/mm/internal.h @@ -0,0 +1,870 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* internal.h: mm/ internal definitions + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ +#ifndef __MM_INTERNAL_H +#define __MM_INTERNAL_H + +#include +#include +#include +#include +#include + +struct folio_batch; + +/* + * The set of flags that only affect watermark checking and reclaim + * behaviour. 
This is used by the MM to obey the caller constraints + * about IO, FS and watermark checking while ignoring placement + * hints such as HIGHMEM usage. + */ +#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ + __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ + __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ + __GFP_ATOMIC|__GFP_NOLOCKDEP) + +/* The GFP flags allowed during early boot */ +#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) + +/* Control allocation cpuset and node placement constraints */ +#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) + +/* Do not use these with a slab allocator */ +#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) + +/* + * Different from WARN_ON_ONCE(), no warning will be issued + * when we specify __GFP_NOWARN. + */ +#define WARN_ON_ONCE_GFP(cond, gfp) ({ \ + static bool __section(".data.once") __warned; \ + int __ret_warn_once = !!(cond); \ + \ + if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \ + __warned = true; \ + WARN_ON(1); \ + } \ + unlikely(__ret_warn_once); \ +}) + +void page_writeback_init(void); + +static inline void *folio_raw_mapping(struct folio *folio) +{ + unsigned long mapping = (unsigned long)folio->mapping; + + return (void *)(mapping & ~PAGE_MAPPING_FLAGS); +} + +void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, + int nr_throttled); +static inline void acct_reclaim_writeback(struct folio *folio) +{ + pg_data_t *pgdat = folio_pgdat(folio); + int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled); + + if (nr_throttled) + __acct_reclaim_writeback(pgdat, folio, nr_throttled); +} + +static inline void wake_throttle_isolated(pg_data_t *pgdat) +{ + wait_queue_head_t *wqh; + + wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED]; + if (waitqueue_active(wqh)) + wake_up(wqh); +} + +vm_fault_t do_swap_page(struct vm_fault *vmf); +void folio_rotate_reclaimable(struct folio *folio); +bool 
__folio_end_writeback(struct folio *folio); +void deactivate_file_folio(struct folio *folio); +void folio_activate(struct folio *folio); + +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long floor, + unsigned long ceiling); +void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); + +struct zap_details; +void unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details); + +void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, + unsigned int order); +void force_page_cache_ra(struct readahead_control *, unsigned long nr); +static inline void force_page_cache_readahead(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read) +{ + DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index); + force_page_cache_ra(&ractl, nr_to_read); +} + +unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); +unsigned find_get_entries(struct address_space *mapping, pgoff_t start, + pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); +void filemap_free_folio(struct address_space *mapping, struct folio *folio); +int truncate_inode_folio(struct address_space *mapping, struct folio *folio); +bool truncate_inode_partial_folio(struct folio *folio, loff_t start, + loff_t end); +long invalidate_inode_page(struct page *page); +unsigned long invalidate_mapping_pagevec(struct address_space *mapping, + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec); + +/** + * folio_evictable - Test whether a folio is evictable. + * @folio: The folio to test. + * + * Test whether @folio is evictable -- i.e., should be placed on + * active/inactive lists vs unevictable list. + * + * Reasons folio might not be evictable: + * 1. folio's mapping marked unevictable + * 2. 
One of the pages in the folio is part of an mlocked VMA + */ +static inline bool folio_evictable(struct folio *folio) +{ + bool ret; + + /* Prevent address_space of inode and swap cache from being freed */ + rcu_read_lock(); + ret = !mapping_unevictable(folio_mapping(folio)) && + !folio_test_mlocked(folio); + rcu_read_unlock(); + return ret; +} + +static inline bool page_evictable(struct page *page) +{ + bool ret; + + /* Prevent address_space of inode and swap cache from being freed */ + rcu_read_lock(); + ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); + rcu_read_unlock(); + return ret; +} + +/* + * Turn a non-refcounted page (->_refcount == 0) into refcounted with + * a count of one. + */ +static inline void set_page_refcounted(struct page *page) +{ + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(page_ref_count(page), page); + set_page_count(page, 1); +} + +/* + * Return true if a folio needs ->release_folio() calling upon it. + */ +static inline bool folio_needs_release(struct folio *folio) +{ + struct address_space *mapping = folio_mapping(folio); + + return folio_has_private(folio) || + (mapping && mapping_release_always(mapping)); +} + +extern unsigned long highest_memmap_pfn; + +/* + * Maximum number of reclaim retries without progress before the OOM + * killer is considered the only way forward.
+ */ +#define MAX_RECLAIM_RETRIES 16 + +/* + * in mm/early_ioremap.c + */ +pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, + unsigned long size, pgprot_t prot); + +/* + * in mm/vmscan.c: + */ +int isolate_lru_page(struct page *page); +int folio_isolate_lru(struct folio *folio); +void putback_lru_page(struct page *page); +void folio_putback_lru(struct folio *folio); +extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); + +/* + * in mm/rmap.c: + */ +pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); + +/* + * in mm/page_alloc.c + */ + +/* + * Structure for holding the mostly immutable allocation parameters passed + * between functions involved in allocations, including the alloc_pages* + * family of functions. + * + * nodemask, migratetype and highest_zoneidx are initialized only once in + * __alloc_pages() and then never change. + * + * zonelist, preferred_zone and highest_zoneidx are set first in + * __alloc_pages() for the fast path, and might be later changed + * in __alloc_pages_slowpath(). All other functions pass the whole structure + * by a const pointer. + */ +struct alloc_context { + struct zonelist *zonelist; + nodemask_t *nodemask; + struct zoneref *preferred_zoneref; + int migratetype; + + /* + * highest_zoneidx represents highest usable zone index of + * the allocation request. Due to the nature of the zone, + * memory on lower zone than the highest_zoneidx will be + * protected by lowmem_reserve[highest_zoneidx]. + * + * highest_zoneidx is also used by reclaim/compaction to limit + * the target zone since higher zone than this index cannot be + * usable for this allocation request. + */ + enum zone_type highest_zoneidx; + bool spread_dirty_pages; +}; + +/* + * This function returns the order of a free page in the buddy system. In + * general, page_zone(page)->lock must be held by the caller to prevent the + * page from being allocated in parallel and returning garbage as the order.
+ * If a caller does not hold page_zone(page)->lock, it must guarantee that the + * page cannot be allocated or merged in parallel. Alternatively, it must + * handle invalid values gracefully, and use buddy_order_unsafe() below. + */ +static inline unsigned int buddy_order(struct page *page) +{ + /* PageBuddy() must be checked by the caller */ + return page_private(page); +} + +/* + * Like buddy_order(), but for callers who cannot afford to hold the zone lock. + * PageBuddy() should be checked first by the caller to minimize race window, + * and invalid values must be handled gracefully. + * + * READ_ONCE is used so that if the caller assigns the result into a local + * variable and e.g. tests it for valid range before using, the compiler cannot + * decide to remove the variable and inline the page_private(page) multiple + * times, potentially observing different values in the tests and the actual + * use of the result. + */ +#define buddy_order_unsafe(page) READ_ONCE(page_private(page)) + +/* + * This function checks whether a page is free && is the buddy + * we can coalesce a page and its buddy if + * (a) the buddy is not in a hole (check before calling!) && + * (b) the buddy is in the buddy system && + * (c) a page and its buddy have the same order && + * (d) a page and its buddy are in the same zone. + * + * For recording whether a page is in the buddy system, we set PageBuddy. + * Setting, clearing, and testing PageBuddy is serialized by zone->lock. + * + * For recording page's order, we use page_private(page). + */ +static inline bool page_is_buddy(struct page *page, struct page *buddy, + unsigned int order) +{ + if (!page_is_guard(buddy) && !PageBuddy(buddy)) + return false; + + if (buddy_order(buddy) != order) + return false; + + /* + * zone check is done late to avoid uselessly calculating + * zone/node ids for pages that could never merge.
+ */ + if (page_zone_id(page) != page_zone_id(buddy)) + return false; + + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + return true; +} + +/* + * Locate the struct page for both the matching buddy in our + * pair (buddy1) and the combined O(n+1) page they form (page). + * + * 1) Any buddy B1 will have an order O twin B2 which satisfies + * the following equation: + * B2 = B1 ^ (1 << O) + * For example, if the starting buddy (buddy2) is #8 its order + * 1 buddy is #10: + * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 + * + * 2) Any buddy B will have an order O+1 parent P which + * satisfies the following equation: + * P = B & ~(1 << O) + * + * Assumption: *_mem_map is contiguous at least up to MAX_ORDER + */ +static inline unsigned long +__find_buddy_pfn(unsigned long page_pfn, unsigned int order) +{ + return page_pfn ^ (1 << order); +} + +/* + * Find the buddy of @page and validate it. + * @page: The input page + * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the + * function is used in the performance-critical __free_one_page(). + * @order: The order of the page + * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to + * page_to_pfn(). + * + * The found buddy can be a non PageBuddy, out of @page's zone, or its order is + * not the same as @page. The validation is necessary before using it. + * + * Return: the found buddy page or NULL if not found.
+ */ +static inline struct page *find_buddy_page_pfn(struct page *page, + unsigned long pfn, unsigned int order, unsigned long *buddy_pfn) +{ + unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order); + struct page *buddy; + + buddy = page + (__buddy_pfn - pfn); + if (buddy_pfn) + *buddy_pfn = __buddy_pfn; + + if (page_is_buddy(page, buddy, order)) + return buddy; + return NULL; +} + +extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone); + +static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone) +{ + if (zone->contiguous) + return pfn_to_page(start_pfn); + + return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); +} + +extern int __isolate_free_page(struct page *page, unsigned int order); +extern void __putback_isolated_page(struct page *page, unsigned int order, + int mt); +extern void memblock_free_pages(struct page *page, unsigned long pfn, + unsigned int order); +extern void __free_pages_core(struct page *page, unsigned int order); +extern void prep_compound_page(struct page *page, unsigned int order); +extern void post_alloc_hook(struct page *page, unsigned int order, + gfp_t gfp_flags); +extern int user_min_free_kbytes; + +extern void free_unref_page(struct page *page, unsigned int order); +extern void free_unref_page_list(struct list_head *list); + +extern void zone_pcp_reset(struct zone *zone); +extern void zone_pcp_disable(struct zone *zone); +extern void zone_pcp_enable(struct zone *zone); + +extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + int nid, bool exact_nid); + +int split_free_page(struct page *free_page, + unsigned int order, unsigned long split_pfn_offset); + +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + +/* + * in mm/compaction.c + */ +/* + * compact_control is used to track pages being migrated and the free pages + * they are being migrated to during memory compaction.
The free_pfn starts + * at the end of a zone and migrate_pfn begins at the start. Movable pages + * are moved to the end of a zone during a compaction run and the run + * completes when free_pfn <= migrate_pfn + */ +struct compact_control { + struct list_head freepages; /* List of free pages to migrate to */ + struct list_head migratepages; /* List of pages being migrated */ + unsigned int nr_freepages; /* Number of isolated free pages */ + unsigned int nr_migratepages; /* Number of pages to migrate */ + unsigned long free_pfn; /* isolate_freepages search base */ + /* + * Acts as an in/out parameter to page isolation for migration. + * isolate_migratepages uses it as a search base. + * isolate_migratepages_block will update the value to the next pfn + * after the last isolated one. + */ + unsigned long migrate_pfn; + unsigned long fast_start_pfn; /* a pfn to start linear scan from */ + struct zone *zone; + unsigned long total_migrate_scanned; + unsigned long total_free_scanned; + unsigned short fast_search_fail;/* failures to use free list searches */ + short search_order; /* order to start a fast search at */ + const gfp_t gfp_mask; /* gfp mask of a direct compactor */ + int order; /* order a direct compactor needs */ + int migratetype; /* migratetype of direct compactor */ + const unsigned int alloc_flags; /* alloc flags of a direct compactor */ + const int highest_zoneidx; /* zone index of a direct compactor */ + enum migrate_mode mode; /* Async or sync migration mode */ + bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool no_set_skip_hint; /* Don't mark blocks for skipping */ + bool ignore_block_suitable; /* Scan blocks considered unsuitable */ + bool direct_compaction; /* False from kcompactd or /proc/...
*/ + bool proactive_compaction; /* kcompactd proactive compaction */ + bool whole_zone; /* Whole zone should/has been scanned */ + bool contended; /* Signal lock contention */ + bool rescan; /* Rescanning the same pageblock */ + bool alloc_contig; /* alloc_contig_range allocation */ +}; + +/* + * Used in direct compaction when a page should be taken from the freelists + * immediately when one is created during the free path. + */ +struct capture_control { + struct compact_control *cc; + struct page *page; +}; + +unsigned long +isolate_freepages_range(struct compact_control *cc, + unsigned long start_pfn, unsigned long end_pfn); +int +isolate_migratepages_range(struct compact_control *cc, + unsigned long low_pfn, unsigned long end_pfn); + +int __alloc_contig_migrate_range(struct compact_control *cc, + unsigned long start, unsigned long end); +#endif +int find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool only_stealable, bool *can_steal); + +/* + * These three helpers classify VMAs for virtual memory accounting. + */ + +/* + * Executable code area - executable, not writable, not stack + */ +static inline bool is_exec_mapping(vm_flags_t flags) +{ + return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; +} + +/* + * Stack area - automatically grows in one direction + * + * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: + * do_mmap() forbids all other combinations.
+ */ +static inline bool is_stack_mapping(vm_flags_t flags) +{ + return (flags & VM_STACK) == VM_STACK; +} + +/* + * Data area - private, writable, not stack + */ +static inline bool is_data_mapping(vm_flags_t flags) +{ + return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; +} + +/* mm/util.c */ +struct anon_vma *folio_anon_vma(struct folio *folio); + +#ifdef CONFIG_MMU +void unmap_mapping_folio(struct folio *folio); +extern long populate_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *locked); +extern long faultin_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + bool write, int *locked); +extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, + unsigned long len); +/* + * mlock_vma_page() and munlock_vma_page(): + * should be called with vma's mmap_lock held for read or write, + * under page table lock for the pte/pmd being added or removed. + * + * mlock is usually called at the end of page_add_*_rmap(), + * munlock at the end of page_remove_rmap(); but new anon + * pages are managed by lru_cache_add_inactive_or_unevictable() + * calling mlock_new_page(). + * + * @compound is used to include pmd mappings of THPs, but filter out + * pte mappings of THPs, which cannot be consistently counted: a pte + * mapping of the THP head cannot be distinguished by the page alone. + */ +void mlock_folio(struct folio *folio); +static inline void mlock_vma_folio(struct folio *folio, + struct vm_area_struct *vma, bool compound) +{ + /* + * The VM_SPECIAL check here serves two purposes. + * 1) VM_IO check prevents migration from double-counting during mlock. + * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED + * is never left set on a VM_SPECIAL vma, there is an interval while + * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may + * still be set while VM_SPECIAL bits are added: so ignore it then. 
+ */ + if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) && + (compound || !folio_test_large(folio))) + mlock_folio(folio); +} + +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + mlock_vma_folio(page_folio(page), vma, compound); +} + +void munlock_page(struct page *page); +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + if (unlikely(vma->vm_flags & VM_LOCKED) && + (compound || !PageTransCompound(page))) + munlock_page(page); +} +void mlock_new_page(struct page *page); +bool need_mlock_page_drain(int cpu); +void mlock_page_drain_local(void); +void mlock_page_drain_remote(int cpu); + +extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); + +/* + * Return the start of user virtual address at the specific offset within + * a vma. + */ +static inline unsigned long +vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages, + struct vm_area_struct *vma) +{ + unsigned long address; + + if (pgoff >= vma->vm_pgoff) { + address = vma->vm_start + + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + /* Check for address beyond vma (or wrapped through 0?) */ + if (address < vma->vm_start || address >= vma->vm_end) + address = -EFAULT; + } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) { + /* Test above avoids possibility of wrap to 0 on 32-bit */ + address = vma->vm_start; + } else { + address = -EFAULT; + } + return address; +} + +/* + * Return the start of user virtual address of a page within a vma. + * Returns -EFAULT if all of the page is outside the range of vma. + * If page is a compound head, the entire compound page is considered. 
+ */ +static inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ + return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma); +} + +/* + * Then at what user virtual address will none of the range be found in vma? + * Assumes that vma_address() already returned a good starting address. + */ +static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw) +{ + struct vm_area_struct *vma = pvmw->vma; + pgoff_t pgoff; + unsigned long address; + + /* Common case, plus ->pgoff is invalid for KSM */ + if (pvmw->nr_pages == 1) + return pvmw->address + PAGE_SIZE; + + pgoff = pvmw->pgoff + pvmw->nr_pages; + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + /* Check for address beyond vma (or wrapped through 0?) */ + if (address < vma->vm_start || address > vma->vm_end) + address = vma->vm_end; + return address; +} + +static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, + struct file *fpin) +{ + int flags = vmf->flags; + + if (fpin) + return fpin; + + /* + * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or + * anything, so we only pin the file and drop the mmap_lock if only + * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt. 
+ */ + if (fault_flag_allow_retry_first(flags) && + !(flags & FAULT_FLAG_RETRY_NOWAIT)) { + fpin = get_file(vmf->vma->vm_file); + mmap_read_unlock(vmf->vma->vm_mm); + } + return fpin; +} +#else /* !CONFIG_MMU */ +static inline void unmap_mapping_folio(struct folio *folio) { } +static inline void mlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } +static inline void munlock_vma_page(struct page *page, + struct vm_area_struct *vma, bool compound) { } +static inline void mlock_new_page(struct page *page) { } +static inline bool need_mlock_page_drain(int cpu) { return false; } +static inline void mlock_page_drain_local(void) { } +static inline void mlock_page_drain_remote(int cpu) { } +static inline void vunmap_range_noflush(unsigned long start, unsigned long end) +{ +} +#endif /* !CONFIG_MMU */ + +/* Memory initialisation debug and verification */ +enum mminit_level { + MMINIT_WARNING, + MMINIT_VERIFY, + MMINIT_TRACE +}; + +#ifdef CONFIG_DEBUG_MEMORY_INIT + +extern int mminit_loglevel; + +#define mminit_dprintk(level, prefix, fmt, arg...) \ +do { \ + if (level < mminit_loglevel) { \ + if (level <= MMINIT_WARNING) \ + pr_warn("mminit::" prefix " " fmt, ##arg); \ + else \ + printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ + } \ +} while (0) + +extern void mminit_verify_pageflags_layout(void); +extern void mminit_verify_zonelist(void); +#else + +static inline void mminit_dprintk(enum mminit_level level, + const char *prefix, const char *fmt, ...) 
+{ +} + +static inline void mminit_verify_pageflags_layout(void) +{ +} + +static inline void mminit_verify_zonelist(void) +{ +} +#endif /* CONFIG_DEBUG_MEMORY_INIT */ + +#define NODE_RECLAIM_NOSCAN -2 +#define NODE_RECLAIM_FULL -1 +#define NODE_RECLAIM_SOME 0 +#define NODE_RECLAIM_SUCCESS 1 + +#ifdef CONFIG_NUMA +extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); +extern int find_next_best_node(int node, nodemask_t *used_node_mask); +#else +static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, + unsigned int order) +{ + return NODE_RECLAIM_NOSCAN; +} +static inline int find_next_best_node(int node, nodemask_t *used_node_mask) +{ + return NUMA_NO_NODE; +} +#endif + +/* + * mm/memory-failure.c + */ +extern int hwpoison_filter(struct page *p); + +extern u32 hwpoison_filter_dev_major; +extern u32 hwpoison_filter_dev_minor; +extern u64 hwpoison_filter_flags_mask; +extern u64 hwpoison_filter_flags_value; +extern u64 hwpoison_filter_memcg; +extern u32 hwpoison_filter_enable; + +#ifdef CONFIG_MEMORY_FAILURE +void clear_hwpoisoned_pages(struct page *memmap, int nr_pages); +#else +static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) +{ +} +#endif + +extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); + +extern void set_pageblock_order(void); +unsigned int reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *page_list); +/* The ALLOC_WMARK bits are used as an index to zone->watermark */ +#define ALLOC_WMARK_MIN WMARK_MIN +#define ALLOC_WMARK_LOW WMARK_LOW +#define ALLOC_WMARK_HIGH WMARK_HIGH +#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ + +/* Mask to get the watermark bits */ +#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) + +/* + * Only MMU archs have async oom victim reclaim - aka oom_reaper so we + * cannot assume a reduced access to memory reserves is sufficient for + * !MMU + */ 
+#ifdef CONFIG_MMU +#define ALLOC_OOM 0x08 +#else +#define ALLOC_OOM ALLOC_NO_WATERMARKS +#endif + +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ +#ifdef CONFIG_ZONE_DMA32 +#define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */ +#else +#define ALLOC_NOFRAGMENT 0x0 +#endif +#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ + +enum ttu_flags; +struct tlbflush_unmap_batch; + + +/* + * only for MM internal work items which do not depend on + * any allocations or locks which might depend on allocations + */ +extern struct workqueue_struct *mm_percpu_wq; + +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +void try_to_unmap_flush(void); +void try_to_unmap_flush_dirty(void); +void flush_tlb_batched_pending(struct mm_struct *mm); +#else +static inline void try_to_unmap_flush(void) +{ +} +static inline void try_to_unmap_flush_dirty(void) +{ +} +static inline void flush_tlb_batched_pending(struct mm_struct *mm) +{ +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + +extern const struct trace_print_flags pageflag_names[]; +extern const struct trace_print_flags vmaflag_names[]; +extern const struct trace_print_flags gfpflag_names[]; + +static inline bool is_migrate_highatomic(enum migratetype migratetype) +{ + return migratetype == MIGRATE_HIGHATOMIC; +} + +static inline bool is_migrate_highatomic_page(struct page *page) +{ + return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; +} + +void setup_zone_pageset(struct zone *zone); + +struct migration_target_control { + int nid; /* preferred node id */ + nodemask_t *nmask; + gfp_t gfp_mask; +}; + +/* + * mm/vmalloc.c + */ +#ifdef CONFIG_MMU +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift); +#else +static inline +int 
vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + return -EINVAL; +} +#endif + +int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); + +void vunmap_range_noflush(unsigned long start, unsigned long end); + +void __vunmap_range_noflush(unsigned long start, unsigned long end); + +int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, + unsigned long addr, int page_nid, int *flags); + +void free_zone_device_page(struct page *page); +int migrate_device_coherent_page(struct page *page); + +/* + * mm/gup.c + */ +struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); + +extern bool mirrored_kernelcore; + +static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) +{ + /* + * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty + * enablements, because when soft-dirty is not compiled in, + * VM_SOFTDIRTY is defined as 0x0, so !(vm_flags & VM_SOFTDIRTY) + * would be constantly true. + */ + if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + return false; + + /* + * Soft-dirty is kind of special: its tracking is enabled when the + * VM_SOFTDIRTY vma flag is not set. 
+ */ + return !(vma->vm_flags & VM_SOFTDIRTY); +} + +#endif /* __MM_INTERNAL_H */ diff --git a/mm/interval_tree.c b/mm/interval_tree.c new file mode 100644 index 000000000..32e390c42 --- /dev/null +++ b/mm/interval_tree.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm/interval_tree.c - interval tree for mapping->i_mmap + * + * Copyright (C) 2012, Michel Lespinasse + */ + +#include +#include +#include +#include + +static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff; +} + +static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) +{ + return v->vm_pgoff + vma_pages(v) - 1; +} + +INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, + unsigned long, shared.rb_subtree_last, + vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree) + +/* Insert node immediately after prev in the interval tree */ +void vma_interval_tree_insert_after(struct vm_area_struct *node, + struct vm_area_struct *prev, + struct rb_root_cached *root) +{ + struct rb_node **link; + struct vm_area_struct *parent; + unsigned long last = vma_last_pgoff(node); + + VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); + + if (!prev->shared.rb.rb_right) { + parent = prev; + link = &prev->shared.rb.rb_right; + } else { + parent = rb_entry(prev->shared.rb.rb_right, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; + while (parent->shared.rb.rb_left) { + parent = rb_entry(parent->shared.rb.rb_left, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; + } + link = &parent->shared.rb.rb_left; + } + + node->shared.rb_subtree_last = last; + rb_link_node(&node->shared.rb, &parent->shared.rb, link); + rb_insert_augmented(&node->shared.rb, &root->rb_root, + &vma_interval_tree_augment); +} + +static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) +{ + return 
vma_start_pgoff(avc->vma); +} + +static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) +{ + return vma_last_pgoff(avc->vma); +} + +INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, + avc_start_pgoff, avc_last_pgoff, + static inline, __anon_vma_interval_tree) + +void anon_vma_interval_tree_insert(struct anon_vma_chain *node, + struct rb_root_cached *root) +{ +#ifdef CONFIG_DEBUG_VM_RB + node->cached_vma_start = avc_start_pgoff(node); + node->cached_vma_last = avc_last_pgoff(node); +#endif + __anon_vma_interval_tree_insert(node, root); +} + +void anon_vma_interval_tree_remove(struct anon_vma_chain *node, + struct rb_root_cached *root) +{ + __anon_vma_interval_tree_remove(node, root); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_first(struct rb_root_cached *root, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_first(root, first, last); +} + +struct anon_vma_chain * +anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, + unsigned long first, unsigned long last) +{ + return __anon_vma_interval_tree_iter_next(node, first, last); +} + +#ifdef CONFIG_DEBUG_VM_RB +void anon_vma_interval_tree_verify(struct anon_vma_chain *node) +{ + WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); + WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); +} +#endif diff --git a/mm/io-mapping.c b/mm/io-mapping.c new file mode 100644 index 000000000..01b362799 --- /dev/null +++ b/mm/io-mapping.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +/** + * io_mapping_map_user - remap an I/O mapping to userspace + * @iomap: the source io_mapping + * @vma: user vma to map to + * @addr: target user address to start at + * @pfn: physical address of kernel memory + * @size: size of map area + * + * Note: this is only safe if the mm semaphore is held when called. 
+ */ +int io_mapping_map_user(struct io_mapping *iomap, struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, unsigned long size) +{ + vm_flags_t expected_flags = VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + + if (WARN_ON_ONCE((vma->vm_flags & expected_flags) != expected_flags)) + return -EINVAL; + + /* We rely on prevalidation of the io-mapping to skip track_pfn(). */ + return remap_pfn_range_notrack(vma, addr, pfn, size, + __pgprot((pgprot_val(iomap->prot) & _PAGE_CACHE_MASK) | + (pgprot_val(vma->vm_page_prot) & ~_PAGE_CACHE_MASK))); +} +EXPORT_SYMBOL_GPL(io_mapping_map_user); diff --git a/mm/ioremap.c b/mm/ioremap.c new file mode 100644 index 000000000..865242628 --- /dev/null +++ b/mm/ioremap.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Re-map IO memory to kernel address space so that we can access it. + * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ +#include +#include +#include +#include + +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) +{ + unsigned long offset, vaddr; + phys_addr_t last_addr; + struct vm_struct *area; + + /* Disallow wrap-around or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + /* Page-align mappings */ + offset = phys_addr & (~PAGE_MASK); + phys_addr -= offset; + size = PAGE_ALIGN(size + offset); + + if (!ioremap_allowed(phys_addr, size, prot)) + return NULL; + + area = get_vm_area_caller(size, VM_IOREMAP, + __builtin_return_address(0)); + if (!area) + return NULL; + vaddr = (unsigned long)area->addr; + area->phys_addr = phys_addr; + + if (ioremap_page_range(vaddr, vaddr + size, phys_addr, + __pgprot(prot))) { + free_vm_area(area); + return NULL; + } + + return (void __iomem *)(vaddr + offset); +} +EXPORT_SYMBOL(ioremap_prot); + +void iounmap(volatile void __iomem *addr) +{ + void *vaddr = (void *)((unsigned 
long)addr & PAGE_MASK); + + if (!iounmap_allowed(vaddr)) + return; + + if (is_vmalloc_addr(vaddr)) + vunmap(vaddr); +} +EXPORT_SYMBOL(iounmap); diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile new file mode 100644 index 000000000..d4837bff3 --- /dev/null +++ b/mm/kasan/Makefile @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: GPL-2.0 +KASAN_SANITIZE := n +UBSAN_SANITIZE := n +KCOV_INSTRUMENT := n + +# Disable ftrace to avoid recursion. +CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_generic.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_quarantine.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report_generic.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report_hw_tags.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report_sw_tags.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_shadow.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_hw_tags.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_sw_tags.o = $(CC_FLAGS_FTRACE) + +# Function splitter causes unnecessary splits in __asan_load1/__asan_store1 +# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 +CC_FLAGS_KASAN_RUNTIME := $(call cc-option, -fno-conserve-stack) +CC_FLAGS_KASAN_RUNTIME += -fno-stack-protector +# Disable branch tracing to avoid recursion. 
+CC_FLAGS_KASAN_RUNTIME += -DDISABLE_BRANCH_PROFILING + +CFLAGS_common.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_generic.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_report_generic.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_report_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_report_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) + +CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) -fno-builtin $(call cc-disable-warning, vla) + +CFLAGS_kasan_test.o := $(CFLAGS_KASAN_TEST) +CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST) + +obj-y := common.o report.o +obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o +obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o +obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o + +obj-$(CONFIG_KASAN_KUNIT_TEST) += kasan_test.o +obj-$(CONFIG_KASAN_MODULE_TEST) += kasan_test_module.o diff --git a/mm/kasan/common.c b/mm/kasan/common.c new file mode 100644 index 000000000..21e66d7f2 --- /dev/null +++ b/mm/kasan/common.c @@ -0,0 +1,452 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains common KASAN code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
+ * Author: Andrey Ryabinin + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kasan.h" +#include "../slab.h" + +struct slab *kasan_addr_to_slab(const void *addr) +{ + if (virt_addr_valid(addr)) + return virt_to_slab(addr); + return NULL; +} + +depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) +{ + unsigned long entries[KASAN_STACK_DEPTH]; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); + return __stack_depot_save(entries, nr_entries, 0, flags, can_alloc); +} + +void kasan_set_track(struct kasan_track *track, gfp_t flags) +{ + track->pid = current->pid; + track->stack = kasan_save_stack(flags, true); +} + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) +void kasan_enable_current(void) +{ + current->kasan_depth++; +} +EXPORT_SYMBOL(kasan_enable_current); + +void kasan_disable_current(void) +{ + current->kasan_depth--; +} +EXPORT_SYMBOL(kasan_disable_current); + +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +void __kasan_unpoison_range(const void *address, size_t size) +{ + kasan_unpoison(address, size, false); +} + +#ifdef CONFIG_KASAN_STACK +/* Unpoison the entire stack for a task. */ +void kasan_unpoison_task_stack(struct task_struct *task) +{ + void *base = task_stack_page(task); + + kasan_unpoison(base, THREAD_SIZE, false); +} + +/* Unpoison the stack for the current task beyond a watermark sp value. */ +asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) +{ + /* + * Calculate the task stack base address. Avoid using 'current' + * because this function is called by early resume code which hasn't + * yet set up the percpu register (%gs). 
+ */ + void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); + + kasan_unpoison(base, watermark - base, false); +} +#endif /* CONFIG_KASAN_STACK */ + +void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) +{ + u8 tag; + unsigned long i; + + if (unlikely(PageHighMem(page))) + return; + + tag = kasan_random_tag(); + kasan_unpoison(set_tag(page_address(page), tag), + PAGE_SIZE << order, init); + for (i = 0; i < (1 << order); i++) + page_kasan_tag_set(page + i, tag); +} + +void __kasan_poison_pages(struct page *page, unsigned int order, bool init) +{ + if (likely(!PageHighMem(page))) + kasan_poison(page_address(page), PAGE_SIZE << order, + KASAN_PAGE_FREE, init); +} + +void __kasan_cache_create_kmalloc(struct kmem_cache *cache) +{ + cache->kasan_info.is_kmalloc = true; +} + +void __kasan_poison_slab(struct slab *slab) +{ + struct page *page = slab_page(slab); + unsigned long i; + + for (i = 0; i < compound_nr(page); i++) + page_kasan_tag_reset(page + i); + kasan_poison(page_address(page), page_size(page), + KASAN_SLAB_REDZONE, false); +} + +void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) +{ + kasan_unpoison(object, cache->object_size, false); +} + +void __kasan_poison_object_data(struct kmem_cache *cache, void *object) +{ + kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), + KASAN_SLAB_REDZONE, false); +} + +/* + * This function assigns a tag to an object considering the following: + * 1. A cache might have a constructor, which might save a pointer to a slab + * object somewhere (e.g. in the object itself). We preassign a tag for + * each object in caches with constructors during slab creation and reuse + * the same tag each time a particular object is allocated. + * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be + * accessed after being freed. We preassign tags for objects in these + * caches as well. + * 3. 
For SLAB allocator we can't preassign tags randomly since the freelist + * is stored as an array of indexes instead of a linked list. Assign tags + * based on objects indexes, so that objects that are next to each other + * get different tags. + */ +static inline u8 assign_tag(struct kmem_cache *cache, + const void *object, bool init) +{ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + return 0xff; + + /* + * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU + * set, assign a tag when the object is being allocated (init == false). + */ + if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) + return init ? KASAN_TAG_KERNEL : kasan_random_tag(); + + /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ +#ifdef CONFIG_SLAB + /* For SLAB assign tags based on the object index in the freelist. */ + return (u8)obj_to_index(cache, virt_to_slab(object), (void *)object); +#else + /* + * For SLUB assign a random tag during slab creation, otherwise reuse + * the already assigned tag. + */ + return init ? kasan_random_tag() : get_tag(object); +#endif +} + +void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, + const void *object) +{ + /* Initialize per-object metadata if it is present. 
*/ + if (kasan_requires_meta()) + kasan_init_object_meta(cache, object); + + /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */ + object = set_tag(object, assign_tag(cache, object, true)); + + return (void *)object; +} + +static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, + unsigned long ip, bool quarantine, bool init) +{ + void *tagged_object; + + if (!kasan_arch_is_ready()) + return false; + + tagged_object = object; + object = kasan_reset_tag(object); + + if (is_kfence_address(object)) + return false; + + if (unlikely(nearest_obj(cache, virt_to_slab(object), object) != + object)) { + kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_INVALID_FREE); + return true; + } + + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) + return false; + + if (!kasan_byte_accessible(tagged_object)) { + kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_DOUBLE_FREE); + return true; + } + + kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE), + KASAN_SLAB_FREE, init); + + if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) + return false; + + if (kasan_stack_collection_enabled()) + kasan_save_free_info(cache, tagged_object); + + return kasan_quarantine_put(cache, object); +} + +bool __kasan_slab_free(struct kmem_cache *cache, void *object, + unsigned long ip, bool init) +{ + return ____kasan_slab_free(cache, object, ip, true, init); +} + +static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) +{ + if (!kasan_arch_is_ready()) + return false; + + if (ptr != page_address(virt_to_head_page(ptr))) { + kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE); + return true; + } + + if (!kasan_byte_accessible(ptr)) { + kasan_report_invalid_free(ptr, ip, KASAN_REPORT_DOUBLE_FREE); + return true; + } + + /* + * The object will be poisoned by kasan_poison_pages() or + * kasan_slab_free_mempool(). 
+ */ + + return false; +} + +void __kasan_kfree_large(void *ptr, unsigned long ip) +{ + ____kasan_kfree_large(ptr, ip); +} + +void __kasan_slab_free_mempool(void *ptr, unsigned long ip) +{ + struct folio *folio; + + folio = virt_to_folio(ptr); + + /* + * Even though this function is only called for kmem_cache_alloc and + * kmalloc backed mempool allocations, those allocations can still be + * !PageSlab() when the size provided to kmalloc is larger than + * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc. + */ + if (unlikely(!folio_test_slab(folio))) { + if (____kasan_kfree_large(ptr, ip)) + return; + kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false); + } else { + struct slab *slab = folio_slab(folio); + + ____kasan_slab_free(slab->slab_cache, ptr, ip, false, false); + } +} + +void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, + void *object, gfp_t flags, bool init) +{ + u8 tag; + void *tagged_object; + + if (gfpflags_allow_blocking(flags)) + kasan_quarantine_reduce(); + + if (unlikely(object == NULL)) + return NULL; + + if (is_kfence_address(object)) + return (void *)object; + + /* + * Generate and assign random tag for tag-based modes. + * Tag is ignored in set_tag() for the generic mode. + */ + tag = assign_tag(cache, object, false); + tagged_object = set_tag(object, tag); + + /* + * Unpoison the whole object. + * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning. + */ + kasan_unpoison(tagged_object, cache->object_size, init); + + /* Save alloc info (if possible) for non-kmalloc() allocations. 
*/ + if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) + kasan_save_alloc_info(cache, tagged_object, flags); + + return tagged_object; +} + +static inline void *____kasan_kmalloc(struct kmem_cache *cache, + const void *object, size_t size, gfp_t flags) +{ + unsigned long redzone_start; + unsigned long redzone_end; + + if (gfpflags_allow_blocking(flags)) + kasan_quarantine_reduce(); + + if (unlikely(object == NULL)) + return NULL; + + if (is_kfence_address(kasan_reset_tag(object))) + return (void *)object; + + /* + * The object has already been unpoisoned by kasan_slab_alloc() for + * kmalloc() or by kasan_krealloc() for krealloc(). + */ + + /* + * The redzone has byte-level precision for the generic mode. + * Partially poison the last object granule to cover the unaligned + * part of the redzone. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + kasan_poison_last_granule((void *)object, size); + + /* Poison the aligned part of the redzone. */ + redzone_start = round_up((unsigned long)(object + size), + KASAN_GRANULE_SIZE); + redzone_end = round_up((unsigned long)(object + cache->object_size), + KASAN_GRANULE_SIZE); + kasan_poison((void *)redzone_start, redzone_end - redzone_start, + KASAN_SLAB_REDZONE, false); + + /* + * Save alloc info (if possible) for kmalloc() allocations. + * This also rewrites the alloc info when called from kasan_krealloc(). + */ + if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc) + kasan_save_alloc_info(cache, (void *)object, flags); + + /* Keep the tag that was set by kasan_slab_alloc(). 
*/ + return (void *)object; +} + +void * __must_check __kasan_kmalloc(struct kmem_cache *cache, const void *object, + size_t size, gfp_t flags) +{ + return ____kasan_kmalloc(cache, object, size, flags); +} +EXPORT_SYMBOL(__kasan_kmalloc); + +void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, + gfp_t flags) +{ + unsigned long redzone_start; + unsigned long redzone_end; + + if (gfpflags_allow_blocking(flags)) + kasan_quarantine_reduce(); + + if (unlikely(ptr == NULL)) + return NULL; + + /* + * The object has already been unpoisoned by kasan_unpoison_pages() for + * alloc_pages() or by kasan_krealloc() for krealloc(). + */ + + /* + * The redzone has byte-level precision for the generic mode. + * Partially poison the last object granule to cover the unaligned + * part of the redzone. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + kasan_poison_last_granule(ptr, size); + + /* Poison the aligned part of the redzone. */ + redzone_start = round_up((unsigned long)(ptr + size), + KASAN_GRANULE_SIZE); + redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr)); + kasan_poison((void *)redzone_start, redzone_end - redzone_start, + KASAN_PAGE_REDZONE, false); + + return (void *)ptr; +} + +void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flags) +{ + struct slab *slab; + + if (unlikely(object == ZERO_SIZE_PTR)) + return (void *)object; + + /* + * Unpoison the object's data. + * Part of it might already have been unpoisoned, but it's unknown + * how big that part is. + */ + kasan_unpoison(object, size, false); + + slab = virt_to_slab(object); + + /* Piggy-back on kmalloc() instrumentation to poison the redzone. 
*/ + if (unlikely(!slab)) + return __kasan_kmalloc_large(object, size, flags); + else + return ____kasan_kmalloc(slab->slab_cache, object, size, flags); +} + +bool __kasan_check_byte(const void *address, unsigned long ip) +{ + if (!kasan_byte_accessible(address)) { + kasan_report((unsigned long)address, 1, false, ip); + return false; + } + return true; +} diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c new file mode 100644 index 000000000..4967988fb --- /dev/null +++ b/mm/kasan/generic.c @@ -0,0 +1,519 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains core generic KASAN code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kasan.h" +#include "../slab.h" + +/* + * All functions below are always inlined so the compiler can + * perform better optimizations in each of __asan_loadX/__asan_storeX + * depending on memory access size X. + */ + +static __always_inline bool memory_is_poisoned_1(unsigned long addr) +{ + s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); + + if (unlikely(shadow_value)) { + s8 last_accessible_byte = addr & KASAN_GRANULE_MASK; + return unlikely(last_accessible_byte >= shadow_value); + } + + return false; +} + +static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, + unsigned long size) +{ + u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr); + + /* + * Access crosses 8(shadow size)-byte boundary. Such access maps + * into 2 shadow bytes, so we need to check them both. 
+ */ + if (unlikely(((addr + size - 1) & KASAN_GRANULE_MASK) < size - 1)) + return *shadow_addr || memory_is_poisoned_1(addr + size - 1); + + return memory_is_poisoned_1(addr + size - 1); +} + +static __always_inline bool memory_is_poisoned_16(unsigned long addr) +{ + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + + /* Unaligned 16-bytes access maps into 3 shadow bytes. */ + if (unlikely(!IS_ALIGNED(addr, KASAN_GRANULE_SIZE))) + return *shadow_addr || memory_is_poisoned_1(addr + 15); + + return *shadow_addr; +} + +static __always_inline unsigned long bytes_is_nonzero(const u8 *start, + size_t size) +{ + while (size) { + if (unlikely(*start)) + return (unsigned long)start; + start++; + size--; + } + + return 0; +} + +static __always_inline unsigned long memory_is_nonzero(const void *start, + const void *end) +{ + unsigned int words; + unsigned long ret; + unsigned int prefix = (unsigned long)start % 8; + + if (end - start <= 16) + return bytes_is_nonzero(start, end - start); + + if (prefix) { + prefix = 8 - prefix; + ret = bytes_is_nonzero(start, prefix); + if (unlikely(ret)) + return ret; + start += prefix; + } + + words = (end - start) / 8; + while (words) { + if (unlikely(*(u64 *)start)) + return bytes_is_nonzero(start, 8); + start += 8; + words--; + } + + return bytes_is_nonzero(start, (end - start) % 8); +} + +static __always_inline bool memory_is_poisoned_n(unsigned long addr, + size_t size) +{ + unsigned long ret; + + ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr), + kasan_mem_to_shadow((void *)addr + size - 1) + 1); + + if (unlikely(ret)) { + unsigned long last_byte = addr + size - 1; + s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); + + if (unlikely(ret != (unsigned long)last_shadow || + ((long)(last_byte & KASAN_GRANULE_MASK) >= *last_shadow))) + return true; + } + return false; +} + +static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) +{ + if (__builtin_constant_p(size)) { + switch 
(size) { + case 1: + return memory_is_poisoned_1(addr); + case 2: + case 4: + case 8: + return memory_is_poisoned_2_4_8(addr, size); + case 16: + return memory_is_poisoned_16(addr); + default: + BUILD_BUG(); + } + } + + return memory_is_poisoned_n(addr, size); +} + +static __always_inline bool check_region_inline(unsigned long addr, + size_t size, bool write, + unsigned long ret_ip) +{ + if (!kasan_arch_is_ready()) + return true; + + if (unlikely(size == 0)) + return true; + + if (unlikely(addr + size < addr)) + return !kasan_report(addr, size, write, ret_ip); + + if (unlikely((void *)addr < + kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + return !kasan_report(addr, size, write, ret_ip); + } + + if (likely(!memory_is_poisoned(addr, size))) + return true; + + return !kasan_report(addr, size, write, ret_ip); +} + +bool kasan_check_range(unsigned long addr, size_t size, bool write, + unsigned long ret_ip) +{ + return check_region_inline(addr, size, write, ret_ip); +} + +bool kasan_byte_accessible(const void *addr) +{ + s8 shadow_byte; + + if (!kasan_arch_is_ready()) + return true; + + shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); + + return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE; +} + +void kasan_cache_shrink(struct kmem_cache *cache) +{ + kasan_quarantine_remove_cache(cache); +} + +void kasan_cache_shutdown(struct kmem_cache *cache) +{ + if (!__kmem_cache_empty(cache)) + kasan_quarantine_remove_cache(cache); +} + +static void register_global(struct kasan_global *global) +{ + size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE); + + kasan_unpoison(global->beg, global->size, false); + + kasan_poison(global->beg + aligned_size, + global->size_with_redzone - aligned_size, + KASAN_GLOBAL_REDZONE, false); +} + +void __asan_register_globals(struct kasan_global *globals, size_t size) +{ + int i; + + for (i = 0; i < size; i++) + register_global(&globals[i]); +} +EXPORT_SYMBOL(__asan_register_globals); + +void 
__asan_unregister_globals(struct kasan_global *globals, size_t size) +{ +} +EXPORT_SYMBOL(__asan_unregister_globals); + +#define DEFINE_ASAN_LOAD_STORE(size) \ + void __asan_load##size(unsigned long addr) \ + { \ + check_region_inline(addr, size, false, _RET_IP_); \ + } \ + EXPORT_SYMBOL(__asan_load##size); \ + __alias(__asan_load##size) \ + void __asan_load##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_load##size##_noabort); \ + void __asan_store##size(unsigned long addr) \ + { \ + check_region_inline(addr, size, true, _RET_IP_); \ + } \ + EXPORT_SYMBOL(__asan_store##size); \ + __alias(__asan_store##size) \ + void __asan_store##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_store##size##_noabort) + +DEFINE_ASAN_LOAD_STORE(1); +DEFINE_ASAN_LOAD_STORE(2); +DEFINE_ASAN_LOAD_STORE(4); +DEFINE_ASAN_LOAD_STORE(8); +DEFINE_ASAN_LOAD_STORE(16); + +void __asan_loadN(unsigned long addr, size_t size) +{ + kasan_check_range(addr, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__asan_loadN); + +__alias(__asan_loadN) +void __asan_loadN_noabort(unsigned long, size_t); +EXPORT_SYMBOL(__asan_loadN_noabort); + +void __asan_storeN(unsigned long addr, size_t size) +{ + kasan_check_range(addr, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__asan_storeN); + +__alias(__asan_storeN) +void __asan_storeN_noabort(unsigned long, size_t); +EXPORT_SYMBOL(__asan_storeN_noabort); + +/* to shut up compiler complaints */ +void __asan_handle_no_return(void) {} +EXPORT_SYMBOL(__asan_handle_no_return); + +/* Emitted by compiler to poison alloca()ed objects. 
*/ +void __asan_alloca_poison(unsigned long addr, size_t size) +{ + size_t rounded_up_size = round_up(size, KASAN_GRANULE_SIZE); + size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) - + rounded_up_size; + size_t rounded_down_size = round_down(size, KASAN_GRANULE_SIZE); + + const void *left_redzone = (const void *)(addr - + KASAN_ALLOCA_REDZONE_SIZE); + const void *right_redzone = (const void *)(addr + rounded_up_size); + + WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); + + kasan_unpoison((const void *)(addr + rounded_down_size), + size - rounded_down_size, false); + kasan_poison(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_LEFT, false); + kasan_poison(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_RIGHT, false); +} +EXPORT_SYMBOL(__asan_alloca_poison); + +/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */ +void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) +{ + if (unlikely(!stack_top || stack_top > stack_bottom)) + return; + + kasan_unpoison(stack_top, stack_bottom - stack_top, false); +} +EXPORT_SYMBOL(__asan_allocas_unpoison); + +/* Emitted by the compiler to [un]poison local variables. */ +#define DEFINE_ASAN_SET_SHADOW(byte) \ + void __asan_set_shadow_##byte(const void *addr, size_t size) \ + { \ + __memset((void *)addr, 0x##byte, size); \ + } \ + EXPORT_SYMBOL(__asan_set_shadow_##byte) + +DEFINE_ASAN_SET_SHADOW(00); +DEFINE_ASAN_SET_SHADOW(f1); +DEFINE_ASAN_SET_SHADOW(f2); +DEFINE_ASAN_SET_SHADOW(f3); +DEFINE_ASAN_SET_SHADOW(f5); +DEFINE_ASAN_SET_SHADOW(f8); + +/* Only allow cache merging when no per-object metadata is present. */ +slab_flags_t kasan_never_merge(void) +{ + if (!kasan_requires_meta()) + return 0; + return SLAB_KASAN; +} + +/* + * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. + * For larger allocations larger redzones are used. 
+ */ +static inline unsigned int optimal_redzone(unsigned int object_size) +{ + return + object_size <= 64 - 16 ? 16 : + object_size <= 128 - 32 ? 32 : + object_size <= 512 - 64 ? 64 : + object_size <= 4096 - 128 ? 128 : + object_size <= (1 << 14) - 256 ? 256 : + object_size <= (1 << 15) - 512 ? 512 : + object_size <= (1 << 16) - 1024 ? 1024 : 2048; +} + +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags) +{ + unsigned int ok_size; + unsigned int optimal_size; + + if (!kasan_requires_meta()) + return; + + /* + * SLAB_KASAN is used to mark caches that are sanitized by KASAN + * and that thus have per-object metadata. + * Currently this flag is used in two places: + * 1. In slab_ksize() to account for per-object metadata when + * calculating the size of the accessible memory within the object. + * 2. In slab_common.c via kasan_never_merge() to prevent merging of + * caches with per-object metadata. + */ + *flags |= SLAB_KASAN; + + ok_size = *size; + + /* Add alloc meta into redzone. */ + cache->kasan_info.alloc_meta_offset = *size; + *size += sizeof(struct kasan_alloc_meta); + + /* + * If alloc meta doesn't fit, don't add it. + * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal + * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for + * larger sizes. + */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.alloc_meta_offset = 0; + *size = ok_size; + /* Continue, since free meta might still fit. */ + } + + /* + * Add free meta into redzone when it's not possible to store + * it in the object. This is the case when: + * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can + * be touched after it was freed, or + * 2. Object has a constructor, which means it's expected to + * retain its content until the next allocation, or + * 3. Object is too small. + * Otherwise cache->kasan_info.free_meta_offset = 0 is implied. 
+ */ + if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta)) { + ok_size = *size; + + cache->kasan_info.free_meta_offset = *size; + *size += sizeof(struct kasan_free_meta); + + /* If free meta doesn't fit, don't add it. */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; + *size = ok_size; + } + } + + /* Calculate size with optimal redzone. */ + optimal_size = cache->object_size + optimal_redzone(cache->object_size); + /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */ + if (optimal_size > KMALLOC_MAX_SIZE) + optimal_size = KMALLOC_MAX_SIZE; + /* Use optimal size if the size with added metas is not large enough. */ + if (*size < optimal_size) + *size = optimal_size; +} + +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object) +{ + if (!cache->kasan_info.alloc_meta_offset) + return NULL; + return (void *)object + cache->kasan_info.alloc_meta_offset; +} + +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const void *object) +{ + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); + if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META) + return NULL; + return (void *)object + cache->kasan_info.free_meta_offset; +} + +void kasan_init_object_meta(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + __memset(alloc_meta, 0, sizeof(*alloc_meta)); +} + +size_t kasan_metadata_size(struct kmem_cache *cache) +{ + if (!kasan_requires_meta()) + return 0; + return (cache->kasan_info.alloc_meta_offset ? + sizeof(struct kasan_alloc_meta) : 0) + + ((cache->kasan_info.free_meta_offset && + cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ? 
+ sizeof(struct kasan_free_meta) : 0); +} + +static void __kasan_record_aux_stack(void *addr, bool can_alloc) +{ + struct slab *slab = kasan_addr_to_slab(addr); + struct kmem_cache *cache; + struct kasan_alloc_meta *alloc_meta; + void *object; + + if (is_kfence_address(addr) || !slab) + return; + + cache = slab->slab_cache; + object = nearest_obj(cache, slab, addr); + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return; + + alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; + alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, can_alloc); +} + +void kasan_record_aux_stack(void *addr) +{ + __kasan_record_aux_stack(addr, true); +} + +void kasan_record_aux_stack_noalloc(void *addr) +{ + __kasan_record_aux_stack(addr, false); +} + +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + kasan_set_track(&alloc_meta->alloc_track, flags); +} + +void kasan_save_free_info(struct kmem_cache *cache, void *object) +{ + struct kasan_free_meta *free_meta; + + free_meta = kasan_get_free_meta(cache, object); + if (!free_meta) + return; + + kasan_set_track(&free_meta->free_track, GFP_NOWAIT); + /* The object was freed and has free track set. */ + *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK; +} diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c new file mode 100644 index 000000000..cc9bc99e4 --- /dev/null +++ b/mm/kasan/hw_tags.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains core hardware tag-based KASAN code. + * + * Copyright (c) 2020 Google, Inc. 
+ * Author: Andrey Konovalov + */ + +#define pr_fmt(fmt) "kasan: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kasan.h" + +enum kasan_arg { + KASAN_ARG_DEFAULT, + KASAN_ARG_OFF, + KASAN_ARG_ON, +}; + +enum kasan_arg_mode { + KASAN_ARG_MODE_DEFAULT, + KASAN_ARG_MODE_SYNC, + KASAN_ARG_MODE_ASYNC, + KASAN_ARG_MODE_ASYMM, +}; + +enum kasan_arg_vmalloc { + KASAN_ARG_VMALLOC_DEFAULT, + KASAN_ARG_VMALLOC_OFF, + KASAN_ARG_VMALLOC_ON, +}; + +static enum kasan_arg kasan_arg __ro_after_init; +static enum kasan_arg_mode kasan_arg_mode __ro_after_init; +static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; + +/* + * Whether KASAN is enabled at all. + * The value remains false until KASAN is initialized by kasan_init_hw_tags(). + */ +DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); +EXPORT_SYMBOL(kasan_flag_enabled); + +/* + * Whether the selected mode is synchronous, asynchronous, or asymmetric. + * Defaults to KASAN_MODE_SYNC. + */ +enum kasan_mode kasan_mode __ro_after_init; +EXPORT_SYMBOL_GPL(kasan_mode); + +/* Whether to enable vmalloc tagging. 
*/ +DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); + +/* kasan=off/on */ +static int __init early_kasan_flag(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_arg = KASAN_ARG_OFF; + else if (!strcmp(arg, "on")) + kasan_arg = KASAN_ARG_ON; + else + return -EINVAL; + + return 0; +} +early_param("kasan", early_kasan_flag); + +/* kasan.mode=sync/async/asymm */ +static int __init early_kasan_mode(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "sync")) + kasan_arg_mode = KASAN_ARG_MODE_SYNC; + else if (!strcmp(arg, "async")) + kasan_arg_mode = KASAN_ARG_MODE_ASYNC; + else if (!strcmp(arg, "asymm")) + kasan_arg_mode = KASAN_ARG_MODE_ASYMM; + else + return -EINVAL; + + return 0; +} +early_param("kasan.mode", early_kasan_mode); + +/* kasan.vmalloc=off/on */ +static int __init early_kasan_flag_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_OFF; + else if (!strcmp(arg, "on")) + kasan_arg_vmalloc = KASAN_ARG_VMALLOC_ON; + else + return -EINVAL; + + return 0; +} +early_param("kasan.vmalloc", early_kasan_flag_vmalloc); + +static inline const char *kasan_mode_info(void) +{ + if (kasan_mode == KASAN_MODE_ASYNC) + return "async"; + else if (kasan_mode == KASAN_MODE_ASYMM) + return "asymm"; + else + return "sync"; +} + +/* + * kasan_init_hw_tags_cpu() is called for each CPU. + * Not marked as __init as a CPU can be hot-plugged after boot. + */ +void kasan_init_hw_tags_cpu(void) +{ + /* + * There's no need to check that the hardware is MTE-capable here, + * as this function is only called for MTE-capable hardware. + */ + + /* + * If KASAN is disabled via command line, don't initialize it. + * When this function is called, kasan_flag_enabled is not yet + * set by kasan_init_hw_tags(). Thus, check kasan_arg instead. + */ + if (kasan_arg == KASAN_ARG_OFF) + return; + + /* + * Enable async or asymm modes only when explicitly requested + * through the command line. 
+ */ + kasan_enable_tagging(); +} + +/* kasan_init_hw_tags() is called once on boot CPU. */ +void __init kasan_init_hw_tags(void) +{ + /* If hardware doesn't support MTE, don't initialize KASAN. */ + if (!system_supports_mte()) + return; + + /* If KASAN is disabled via command line, don't initialize it. */ + if (kasan_arg == KASAN_ARG_OFF) + return; + + switch (kasan_arg_mode) { + case KASAN_ARG_MODE_DEFAULT: + /* Default is specified by kasan_mode definition. */ + break; + case KASAN_ARG_MODE_SYNC: + kasan_mode = KASAN_MODE_SYNC; + break; + case KASAN_ARG_MODE_ASYNC: + kasan_mode = KASAN_MODE_ASYNC; + break; + case KASAN_ARG_MODE_ASYMM: + kasan_mode = KASAN_MODE_ASYMM; + break; + } + + switch (kasan_arg_vmalloc) { + case KASAN_ARG_VMALLOC_DEFAULT: + /* Default is specified by kasan_flag_vmalloc definition. */ + break; + case KASAN_ARG_VMALLOC_OFF: + static_branch_disable(&kasan_flag_vmalloc); + break; + case KASAN_ARG_VMALLOC_ON: + static_branch_enable(&kasan_flag_vmalloc); + break; + } + + kasan_init_tags(); + + /* KASAN is now initialized, enable it. */ + static_branch_enable(&kasan_flag_enabled); + + pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", + kasan_mode_info(), + kasan_vmalloc_enabled() ? "on" : "off", + kasan_stack_collection_enabled() ? "on" : "off"); +} + +#ifdef CONFIG_KASAN_VMALLOC + +static void unpoison_vmalloc_pages(const void *addr, u8 tag) +{ + struct vm_struct *area; + int i; + + /* + * As hardware tag-based KASAN only tags VM_ALLOC vmalloc allocations + * (see the comment in __kasan_unpoison_vmalloc), all of the pages + * should belong to a single area. 
+ */ + area = find_vm_area((void *)addr); + if (WARN_ON(!area)) + return; + + for (i = 0; i < area->nr_pages; i++) { + struct page *page = area->pages[i]; + + page_kasan_tag_set(page, tag); + } +} + +static void init_vmalloc_pages(const void *start, unsigned long size) +{ + const void *addr; + + for (addr = start; addr < start + size; addr += PAGE_SIZE) { + struct page *page = vmalloc_to_page(addr); + + clear_highpage_kasan_tagged(page); + } +} + +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) +{ + u8 tag; + unsigned long redzone_start, redzone_size; + + if (!kasan_vmalloc_enabled()) { + if (flags & KASAN_VMALLOC_INIT) + init_vmalloc_pages(start, size); + return (void *)start; + } + + /* + * Don't tag non-VM_ALLOC mappings, as: + * + * 1. Unlike the software KASAN modes, hardware tag-based KASAN only + * supports tagging physical memory. Therefore, it can only tag a + * single mapping of normal physical pages. + * 2. Hardware tag-based KASAN can only tag memory mapped with special + * mapping protection bits, see arch_vmap_pgprot_tagged(). + * As non-VM_ALLOC mappings can be mapped outside of vmalloc code, + * providing these bits would require tracking all non-VM_ALLOC + * mappers. + * + * Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags + * the first virtual mapping, which is created by vmalloc(). + * Tagging the page_alloc memory backing that vmalloc() allocation is + * skipped, see ___GFP_SKIP_KASAN_UNPOISON. + * + * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual. + */ + if (!(flags & KASAN_VMALLOC_VM_ALLOC)) { + WARN_ON(flags & KASAN_VMALLOC_INIT); + return (void *)start; + } + + /* + * Don't tag executable memory. + * The kernel doesn't tolerate having the PC register tagged. 
+ */ + if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) { + WARN_ON(flags & KASAN_VMALLOC_INIT); + return (void *)start; + } + + tag = kasan_random_tag(); + start = set_tag(start, tag); + + /* Unpoison and initialize memory up to size. */ + kasan_unpoison(start, size, flags & KASAN_VMALLOC_INIT); + + /* + * Explicitly poison and initialize the in-page vmalloc() redzone. + * Unlike software KASAN modes, hardware tag-based KASAN doesn't + * unpoison memory when populating shadow for vmalloc() space. + */ + redzone_start = round_up((unsigned long)start + size, + KASAN_GRANULE_SIZE); + redzone_size = round_up(redzone_start, PAGE_SIZE) - redzone_start; + kasan_poison((void *)redzone_start, redzone_size, KASAN_TAG_INVALID, + flags & KASAN_VMALLOC_INIT); + + /* + * Set per-page tag flags to allow accessing physical memory for the + * vmalloc() mapping through page_address(vmalloc_to_page()). + */ + unpoison_vmalloc_pages(start, tag); + + return (void *)start; +} + +void __kasan_poison_vmalloc(const void *start, unsigned long size) +{ + /* + * No tagging here. + * The physical pages backing the vmalloc() allocation are poisoned + * through the usual page_alloc paths. + */ +} + +#endif + +void kasan_enable_tagging(void) +{ + if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC) + hw_enable_tagging_async(); + else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM) + hw_enable_tagging_asymm(); + else + hw_enable_tagging_sync(); +} + +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) + +EXPORT_SYMBOL_GPL(kasan_enable_tagging); + +void kasan_force_async_fault(void) +{ + hw_force_async_tag_fault(); +} +EXPORT_SYMBOL_GPL(kasan_force_async_fault); + +#endif diff --git a/mm/kasan/init.c b/mm/kasan/init.c new file mode 100644 index 000000000..cc64ed685 --- /dev/null +++ b/mm/kasan/init.c @@ -0,0 +1,491 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains KASAN shadow initialization code. + * + * Copyright (c) 2015 Samsung Electronics Co., Ltd. 
+ * Author: Andrey Ryabinin + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "kasan.h" + +/* + * This page serves two purposes: + * - It is used as early shadow memory. The entire shadow region is populated + * with this page before we are able to set up normal shadow memory. + * - Later it is reused as zero shadow to cover large ranges of memory + * that are allowed to be accessed, but are not handled by kasan (vmalloc/vmemmap ...). + */ +unsigned char kasan_early_shadow_page[PAGE_SIZE] __page_aligned_bss; + +#if CONFIG_PGTABLE_LEVELS > 4 +p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss; +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return pgd_page(pgd) == virt_to_page(lm_alias(kasan_early_shadow_p4d)); +} +#else +static inline bool kasan_p4d_table(pgd_t pgd) +{ + return false; +} +#endif +#if CONFIG_PGTABLE_LEVELS > 3 +pud_t kasan_early_shadow_pud[MAX_PTRS_PER_PUD] __page_aligned_bss; +static inline bool kasan_pud_table(p4d_t p4d) +{ + return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud)); +} +#else +static inline bool kasan_pud_table(p4d_t p4d) +{ + return false; +} +#endif +#if CONFIG_PGTABLE_LEVELS > 2 +pmd_t kasan_early_shadow_pmd[MAX_PTRS_PER_PMD] __page_aligned_bss; +static inline bool kasan_pmd_table(pud_t pud) +{ + return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd)); +} +#else +static inline bool kasan_pmd_table(pud_t pud) +{ + return false; +} +#endif +pte_t kasan_early_shadow_pte[MAX_PTRS_PER_PTE + PTE_HWTABLE_PTRS] + __page_aligned_bss; + +static inline bool kasan_pte_table(pmd_t pmd) +{ + return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte)); +} + +static inline bool kasan_early_shadow_page_entry(pte_t pte) +{ + return pte_page(pte) == virt_to_page(lm_alias(kasan_early_shadow_page)); +} + +static __init void *early_alloc(size_t size, int node) +{ + void *ptr = memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), + 
MEMBLOCK_ALLOC_ACCESSIBLE, node); + + if (!ptr) + panic("%s: Failed to allocate %zu bytes align=%zx nid=%d from=%llx\n", + __func__, size, size, node, (u64)__pa(MAX_DMA_ADDRESS)); + + return ptr; +} + +static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + pte_t zero_pte; + + zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_early_shadow_page)), + PAGE_KERNEL); + zero_pte = pte_wrprotect(zero_pte); + + while (addr + PAGE_SIZE <= end) { + set_pte_at(&init_mm, addr, pte, zero_pte); + addr += PAGE_SIZE; + pte = pte_offset_kernel(pmd, addr); + } +} + +static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, + unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, addr); + unsigned long next; + + do { + next = pmd_addr_end(addr, end); + + if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_early_shadow_pte)); + continue; + } + + if (pmd_none(*pmd)) { + pte_t *p; + + if (slab_is_available()) + p = pte_alloc_one_kernel(&init_mm); + else + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + + pmd_populate_kernel(&init_mm, pmd, p); + } + zero_pte_populate(pmd, addr, next); + } while (pmd++, addr = next, addr != end); + + return 0; +} + +static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, + unsigned long end) +{ + pud_t *pud = pud_offset(p4d, addr); + unsigned long next; + + do { + next = pud_addr_end(addr, end); + if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { + pmd_t *pmd; + + pud_populate(&init_mm, pud, + lm_alias(kasan_early_shadow_pmd)); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_early_shadow_pte)); + continue; + } + + if (pud_none(*pud)) { + pmd_t *p; + + if (slab_is_available()) { + p = pmd_alloc(&init_mm, pud, addr); + if (!p) + return -ENOMEM; + } else { + pud_populate(&init_mm, pud, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + 
} + zero_pmd_populate(pud, addr, next); /* NOTE(review): -ENOMEM from zero_pmd_populate() is ignored here */ + } while (pud++, addr = next, addr != end); + + return 0; +} + +static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, + unsigned long end) +{ + p4d_t *p4d = p4d_offset(pgd, addr); + unsigned long next; + + do { + next = p4d_addr_end(addr, end); + if (IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) { + pud_t *pud; + pmd_t *pmd; + + p4d_populate(&init_mm, p4d, + lm_alias(kasan_early_shadow_pud)); + pud = pud_offset(p4d, addr); + pud_populate(&init_mm, pud, + lm_alias(kasan_early_shadow_pmd)); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_early_shadow_pte)); + continue; + } + + if (p4d_none(*p4d)) { + pud_t *p; + + if (slab_is_available()) { + p = pud_alloc(&init_mm, p4d, addr); + if (!p) + return -ENOMEM; + } else { + p4d_populate(&init_mm, p4d, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + } + zero_pud_populate(p4d, addr, next); /* NOTE(review): -ENOMEM from zero_pud_populate() is ignored here */ + } while (p4d++, addr = next, addr != end); + + return 0; +} + +/** + * kasan_populate_early_shadow - populate shadow memory region with + * kasan_early_shadow_page + * @shadow_start: start of the memory range to populate + * @shadow_end: end of the memory range to populate + */ +int __ref kasan_populate_early_shadow(const void *shadow_start, + const void *shadow_end) +{ + unsigned long addr = (unsigned long)shadow_start; + unsigned long end = (unsigned long)shadow_end; + pgd_t *pgd = pgd_offset_k(addr); + unsigned long next; + + do { + next = pgd_addr_end(addr, end); + + if (IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) { + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + /* + * kasan_early_shadow_pud should be populated with pmds + * at this moment. + * [pud,pmd]_populate*() below are needed only for + * 3,2-level page tables where we don't have + * puds,pmds, so pgd_populate(), pud_populate() + * are no-ops. 
+ */ + pgd_populate(&init_mm, pgd, + lm_alias(kasan_early_shadow_p4d)); + p4d = p4d_offset(pgd, addr); + p4d_populate(&init_mm, p4d, + lm_alias(kasan_early_shadow_pud)); + pud = pud_offset(p4d, addr); + pud_populate(&init_mm, pud, + lm_alias(kasan_early_shadow_pmd)); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_early_shadow_pte)); + continue; + } + + if (pgd_none(*pgd)) { + p4d_t *p; + + if (slab_is_available()) { + p = p4d_alloc(&init_mm, pgd, addr); + if (!p) + return -ENOMEM; + } else { + pgd_populate(&init_mm, pgd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + } + zero_p4d_populate(pgd, addr, next); /* NOTE(review): -ENOMEM from zero_p4d_populate() is ignored here */ + } while (pgd++, addr = next, addr != end); + + return 0; +} + +static void kasan_free_pte(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pte_free_kernel(&init_mm, (pte_t *)page_to_virt(pmd_page(*pmd))); + pmd_clear(pmd); +} + +static void kasan_free_pmd(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pmd_free(&init_mm, (pmd_t *)page_to_virt(pud_page(*pud))); + pud_clear(pud); +} + +static void kasan_free_pud(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + pud_free(&init_mm, (pud_t *)page_to_virt(p4d_page(*p4d))); + p4d_clear(p4d); +} + +static void kasan_free_p4d(p4d_t *p4d_start, pgd_t *pgd) +{ + p4d_t *p4d; + int i; + + for (i = 0; i < PTRS_PER_P4D; i++) { + p4d = p4d_start + i; + if (!p4d_none(*p4d)) + return; + } + + p4d_free(&init_mm, (p4d_t *)page_to_virt(pgd_page(*pgd))); + pgd_clear(pgd); +} + +static void kasan_remove_pte_table(pte_t *pte, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & 
PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + if (WARN_ON(!kasan_early_shadow_page_entry(*pte))) + continue; + pte_clear(&init_mm, addr, pte); + } +} + +static void kasan_remove_pmd_table(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pmd++) { + pte_t *pte; + + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (kasan_pte_table(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + pmd_clear(pmd); + continue; + } + } + pte = pte_offset_kernel(pmd, addr); + kasan_remove_pte_table(pte, addr, next); + kasan_free_pte(pte_offset_kernel(pmd, 0), pmd); + } +} + +static void kasan_remove_pud_table(pud_t *pud, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, pud++) { + pmd_t *pmd, *pmd_base; + + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (kasan_pmd_table(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + pud_clear(pud); + continue; + } + } + pmd = pmd_offset(pud, addr); + pmd_base = pmd_offset(pud, 0); + kasan_remove_pmd_table(pmd, addr, next); + kasan_free_pmd(pmd_base, pud); + } +} + +static void kasan_remove_p4d_table(p4d_t *p4d, unsigned long addr, + unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next, p4d++) { + pud_t *pud; + + next = p4d_addr_end(addr, end); + + if (!p4d_present(*p4d)) + continue; + + if (kasan_pud_table(*p4d)) { + if (IS_ALIGNED(addr, P4D_SIZE) && + IS_ALIGNED(next, P4D_SIZE)) { + p4d_clear(p4d); + continue; + } + } + pud = pud_offset(p4d, addr); + kasan_remove_pud_table(pud, addr, next); + kasan_free_pud(pud_offset(p4d, 0), p4d); + } +} + +void kasan_remove_zero_shadow(void *start, unsigned long size) +{ + unsigned long addr, end, next; + pgd_t *pgd; + + addr = (unsigned long)kasan_mem_to_shadow(start); + end = addr + (size >> 
KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % KASAN_MEMORY_PER_SHADOW_PAGE) || + WARN_ON(size % KASAN_MEMORY_PER_SHADOW_PAGE)) + return; + + for (; addr < end; addr = next) { + p4d_t *p4d; + + next = pgd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + continue; + + if (kasan_p4d_table(*pgd)) { + if (IS_ALIGNED(addr, PGDIR_SIZE) && + IS_ALIGNED(next, PGDIR_SIZE)) { + pgd_clear(pgd); + continue; + } + } + + p4d = p4d_offset(pgd, addr); + kasan_remove_p4d_table(p4d, addr, next); + kasan_free_p4d(p4d_offset(pgd, 0), pgd); + } +} + +int kasan_add_zero_shadow(void *start, unsigned long size) +{ + int ret; + void *shadow_start, *shadow_end; + + shadow_start = kasan_mem_to_shadow(start); + shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT); + + if (WARN_ON((unsigned long)start % KASAN_MEMORY_PER_SHADOW_PAGE) || + WARN_ON(size % KASAN_MEMORY_PER_SHADOW_PAGE)) + return -EINVAL; + + ret = kasan_populate_early_shadow(shadow_start, shadow_end); + if (ret) + kasan_remove_zero_shadow(start, size); + return ret; +} diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h new file mode 100644 index 000000000..a898f05a2 --- /dev/null +++ b/mm/kasan/kasan.h @@ -0,0 +1,635 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __MM_KASAN_KASAN_H +#define __MM_KASAN_KASAN_H + +#include +#include +#include +#include +#include + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + +#include + +DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); + +static inline bool kasan_stack_collection_enabled(void) +{ + return static_branch_unlikely(&kasan_flag_stacktrace); +} + +#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +static inline bool kasan_stack_collection_enabled(void) +{ + return true; +} + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_HW_TAGS + +#include "../slab.h" + +DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc); + +enum kasan_mode { + KASAN_MODE_SYNC, + KASAN_MODE_ASYNC, + 
KASAN_MODE_ASYMM, +}; + +extern enum kasan_mode kasan_mode __ro_after_init; + +static inline bool kasan_vmalloc_enabled(void) +{ + return static_branch_likely(&kasan_flag_vmalloc); +} + +static inline bool kasan_async_fault_possible(void) +{ + return kasan_mode == KASAN_MODE_ASYNC || kasan_mode == KASAN_MODE_ASYMM; +} + +static inline bool kasan_sync_fault_possible(void) +{ + return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM; +} + +#else /* CONFIG_KASAN_HW_TAGS */ + +static inline bool kasan_async_fault_possible(void) +{ + return false; +} + +static inline bool kasan_sync_fault_possible(void) +{ + return true; +} + +#endif /* CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_GENERIC + +/* Generic KASAN uses per-object metadata to store stack traces. */ +static inline bool kasan_requires_meta(void) +{ + /* + * Technically, Generic KASAN always collects stack traces right now. + * However, let's use kasan_stack_collection_enabled() in case the + * kasan.stacktrace command-line argument is changed to affect + * Generic KASAN. + */ + return kasan_stack_collection_enabled(); +} + +#else /* CONFIG_KASAN_GENERIC */ + +/* Tag-based KASAN modes do not use per-object metadata. 
*/ +static inline bool kasan_requires_meta(void) +{ + return false; +} + +#endif /* CONFIG_KASAN_GENERIC */ + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) +#define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) +#else +#include +#define KASAN_GRANULE_SIZE MTE_GRANULE_SIZE +#endif + +#define KASAN_GRANULE_MASK (KASAN_GRANULE_SIZE - 1) + +#define KASAN_MEMORY_PER_SHADOW_PAGE (KASAN_GRANULE_SIZE << PAGE_SHIFT) + +#ifdef CONFIG_KASAN_GENERIC +#define KASAN_PAGE_FREE 0xFF /* freed page */ +#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocation */ +#define KASAN_SLAB_REDZONE 0xFC /* redzone for slab object */ +#define KASAN_SLAB_FREE 0xFB /* freed slab object */ +#define KASAN_VMALLOC_INVALID 0xF8 /* inaccessible space in vmap area */ +#else +#define KASAN_PAGE_FREE KASAN_TAG_INVALID +#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID +#define KASAN_SLAB_REDZONE KASAN_TAG_INVALID +#define KASAN_SLAB_FREE KASAN_TAG_INVALID +#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only used for SW_TAGS */ +#endif + +#ifdef CONFIG_KASAN_GENERIC + +#define KASAN_SLAB_FREETRACK 0xFA /* freed slab object with free track */ +#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */ + +/* Stack redzone shadow values. Compiler ABI, do not change. */ +#define KASAN_STACK_LEFT 0xF1 +#define KASAN_STACK_MID 0xF2 +#define KASAN_STACK_RIGHT 0xF3 +#define KASAN_STACK_PARTIAL 0xF4 + +/* alloca redzone shadow values. */ +#define KASAN_ALLOCA_LEFT 0xCA +#define KASAN_ALLOCA_RIGHT 0xCB + +/* alloca redzone size. Compiler ABI, do not change. */ +#define KASAN_ALLOCA_REDZONE_SIZE 32 + +/* Stack frame marker. Compiler ABI, do not change. */ +#define KASAN_CURRENT_STACK_FRAME_MAGIC 0x41B58AB3 + +/* Dummy value to avoid breaking randconfig/all*config builds. */ +#ifndef KASAN_ABI_VERSION +#define KASAN_ABI_VERSION 1 +#endif + +#endif /* CONFIG_KASAN_GENERIC */ + +/* Metadata layout customization. 
*/ +#define META_BYTES_PER_BLOCK 1 +#define META_BLOCKS_PER_ROW 16 +#define META_BYTES_PER_ROW (META_BLOCKS_PER_ROW * META_BYTES_PER_BLOCK) +#define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE) +#define META_ROWS_AROUND_ADDR 2 + +#define KASAN_STACK_DEPTH 64 + +struct kasan_track { + u32 pid; + depot_stack_handle_t stack; +}; + +enum kasan_report_type { + KASAN_REPORT_ACCESS, + KASAN_REPORT_INVALID_FREE, + KASAN_REPORT_DOUBLE_FREE, +}; + +struct kasan_report_info { + /* Filled in by kasan_report_*(). */ + enum kasan_report_type type; + void *access_addr; + size_t access_size; + bool is_write; + unsigned long ip; + + /* Filled in by the common reporting code. */ + void *first_bad_addr; + struct kmem_cache *cache; + void *object; + + /* Filled in by the mode-specific reporting code. */ + const char *bug_type; + struct kasan_track alloc_track; + struct kasan_track free_track; +}; + +/* Do not change the struct layout: compiler ABI. */ +struct kasan_source_location { + const char *filename; + int line_no; + int column_no; +}; + +/* Do not change the struct layout: compiler ABI. */ +struct kasan_global { + const void *beg; /* Address of the beginning of the global variable. */ + size_t size; /* Size of the global variable. */ + size_t size_with_redzone; /* Size of the variable + size of the redzone. 32 bytes aligned. */ + const void *name; + const void *module_name; /* Name of the module where the global variable is declared. */ + unsigned long has_dynamic_init; /* This is needed for C++. */ +#if KASAN_ABI_VERSION >= 4 + struct kasan_source_location *location; +#endif +#if KASAN_ABI_VERSION >= 5 + char *odr_indicator; +#endif +}; + +/* Structures for keeping alloc and free meta. */ + +#ifdef CONFIG_KASAN_GENERIC + +struct kasan_alloc_meta { + struct kasan_track alloc_track; + /* Free track is stored in kasan_free_meta. 
*/ + depot_stack_handle_t aux_stack[2]; +}; + +struct qlist_node { + struct qlist_node *next; +}; + +/* + * Free meta is stored either in the object itself or in the redzone after the + * object. In the former case, free meta offset is 0. In the latter case, the + * offset is between 0 and INT_MAX. INT_MAX marks that free meta is not present. + */ +#define KASAN_NO_FREE_META INT_MAX + +/* + * Free meta is only used by Generic mode while the object is in quarantine. + * After that, slab allocator stores the freelist pointer in the object. + */ +struct kasan_free_meta { + struct qlist_node quarantine_link; + struct kasan_track free_track; +}; + +#endif /* CONFIG_KASAN_GENERIC */ + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + +struct kasan_stack_ring_entry { + void *ptr; + size_t size; + u32 pid; + depot_stack_handle_t stack; + bool is_free; +}; + +struct kasan_stack_ring { + rwlock_t lock; + size_t size; + atomic64_t pos; + struct kasan_stack_ring_entry *entries; +}; + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) +/* Used in KUnit-compatible KASAN tests. */ +struct kunit_kasan_status { + bool report_found; + bool sync_fault; +}; +#endif + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + +static inline const void *kasan_shadow_to_mem(const void *shadow_addr) +{ + return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) + << KASAN_SHADOW_SCALE_SHIFT); +} + +static inline bool addr_has_metadata(const void *addr) +{ + return (kasan_reset_tag(addr) >= + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); +} + +/** + * kasan_check_range - Check memory region, and report if invalid access. 
+ * @addr: the accessed address + * @size: the accessed size + * @write: true if access is a write access + * @ret_ip: return address + * @return: true if access was valid, false if invalid + */ +bool kasan_check_range(unsigned long addr, size_t size, bool write, + unsigned long ret_ip); + +#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +static inline bool addr_has_metadata(const void *addr) +{ + return (is_vmalloc_addr(addr) || virt_addr_valid(addr)); +} + +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +void *kasan_find_first_bad_addr(void *addr, size_t size); +void kasan_complete_mode_report_info(struct kasan_report_info *info); +void kasan_metadata_fetch_row(char *buffer, void *row); + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +void kasan_print_tags(u8 addr_tag, const void *addr); +#else +static inline void kasan_print_tags(u8 addr_tag, const void *addr) { } +#endif + +#if defined(CONFIG_KASAN_STACK) +void kasan_print_address_stack_frame(const void *addr); +#else +static inline void kasan_print_address_stack_frame(const void *addr) { } +#endif + +#ifdef CONFIG_KASAN_GENERIC +void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object); +#else +static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { } +#endif + +bool kasan_report(unsigned long addr, size_t size, + bool is_write, unsigned long ip); +void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); + +struct slab *kasan_addr_to_slab(const void *addr); + +#ifdef CONFIG_KASAN_GENERIC +void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size); +void kasan_init_object_meta(struct kmem_cache *cache, const void *object); +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object); +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const void *object); +#else +static inline void kasan_init_cache_meta(struct 
kmem_cache *cache, unsigned int *size) { } +static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } +#endif + +depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); +void kasan_set_track(struct kasan_track *track, gfp_t flags); +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); +void kasan_save_free_info(struct kmem_cache *cache, void *object); + +#if defined(CONFIG_KASAN_GENERIC) && \ + (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) +bool kasan_quarantine_put(struct kmem_cache *cache, void *object); +void kasan_quarantine_reduce(void); +void kasan_quarantine_remove_cache(struct kmem_cache *cache); +#else +static inline bool kasan_quarantine_put(struct kmem_cache *cache, void *object) { return false; } +static inline void kasan_quarantine_reduce(void) { } +static inline void kasan_quarantine_remove_cache(struct kmem_cache *cache) { } +#endif + +#ifndef arch_kasan_set_tag +static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) +{ + return addr; +} +#endif +#ifndef arch_kasan_get_tag +#define arch_kasan_get_tag(addr) 0 +#endif + +#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag))) +#define get_tag(addr) arch_kasan_get_tag(addr) + +#ifdef CONFIG_KASAN_HW_TAGS + +#ifndef arch_enable_tagging_sync +#define arch_enable_tagging_sync() +#endif +#ifndef arch_enable_tagging_async +#define arch_enable_tagging_async() +#endif +#ifndef arch_enable_tagging_asymm +#define arch_enable_tagging_asymm() +#endif +#ifndef arch_force_async_tag_fault +#define arch_force_async_tag_fault() +#endif +#ifndef arch_get_random_tag +#define arch_get_random_tag() (0xFF) +#endif +#ifndef arch_get_mem_tag +#define arch_get_mem_tag(addr) (0xFF) +#endif +#ifndef arch_set_mem_tag_range +#define arch_set_mem_tag_range(addr, size, tag, init) ((void *)(addr)) +#endif + +#define hw_enable_tagging_sync() arch_enable_tagging_sync() +#define hw_enable_tagging_async() arch_enable_tagging_async() 
+#define hw_enable_tagging_asymm() arch_enable_tagging_asymm() +#define hw_force_async_tag_fault() arch_force_async_tag_fault() +#define hw_get_random_tag() arch_get_random_tag() +#define hw_get_mem_tag(addr) arch_get_mem_tag(addr) +#define hw_set_mem_tag_range(addr, size, tag, init) \ + arch_set_mem_tag_range((addr), (size), (tag), (init)) + +void kasan_enable_tagging(void); + +#else /* CONFIG_KASAN_HW_TAGS */ + +#define hw_enable_tagging_sync() +#define hw_enable_tagging_async() +#define hw_enable_tagging_asymm() + +static inline void kasan_enable_tagging(void) { } + +#endif /* CONFIG_KASAN_HW_TAGS */ + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +void __init kasan_init_tags(void); +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) + +void kasan_force_async_fault(void); + +#else /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */ + +static inline void kasan_force_async_fault(void) { } + +#endif /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */ + +#ifdef CONFIG_KASAN_SW_TAGS +u8 kasan_random_tag(void); +#elif defined(CONFIG_KASAN_HW_TAGS) +static inline u8 kasan_random_tag(void) { return hw_get_random_tag(); } +#else +static inline u8 kasan_random_tag(void) { return 0; } +#endif + +#ifdef CONFIG_KASAN_HW_TAGS + +static inline void kasan_poison(const void *addr, size_t size, u8 value, bool init) +{ + addr = kasan_reset_tag(addr); + + /* Skip KFENCE memory if called explicitly outside of sl*b. */ + if (is_kfence_address(addr)) + return; + + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + if (WARN_ON(size & KASAN_GRANULE_MASK)) + return; + + hw_set_mem_tag_range((void *)addr, size, value, init); +} + +static inline void kasan_unpoison(const void *addr, size_t size, bool init) +{ + u8 tag = get_tag(addr); + + addr = kasan_reset_tag(addr); + + /* Skip KFENCE memory if called explicitly outside of sl*b. 
*/ + if (is_kfence_address(addr)) + return; + + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + /* + * Explicitly initialize the memory with the precise object size to + * avoid overwriting the slab redzone. This disables initialization in + * the arch code and may thus lead to performance penalty. This penalty + * does not affect production builds, as slab redzones are not enabled + * there. + */ + if (__slub_debug_enabled() && + init && ((unsigned long)size & KASAN_GRANULE_MASK)) { + init = false; + memzero_explicit((void *)addr, size); + } + size = round_up(size, KASAN_GRANULE_SIZE); + + hw_set_mem_tag_range((void *)addr, size, tag, init); +} + +static inline bool kasan_byte_accessible(const void *addr) +{ + u8 ptr_tag = get_tag(addr); + u8 mem_tag = hw_get_mem_tag((void *)addr); + + return ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag; +} + +#else /* CONFIG_KASAN_HW_TAGS */ + +/** + * kasan_poison - mark the memory range as inaccessible + * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE + * @size - range size, must be aligned to KASAN_GRANULE_SIZE + * @value - value that's written to metadata for the range + * @init - whether to initialize the memory range (only for hardware tag-based) + * + * The size gets aligned to KASAN_GRANULE_SIZE before marking the range. + */ +void kasan_poison(const void *addr, size_t size, u8 value, bool init); + +/** + * kasan_unpoison - mark the memory range as accessible + * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE + * @size - range size, can be unaligned + * @init - whether to initialize the memory range (only for hardware tag-based) + * + * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before + * marking the range. + * For the generic mode, the last granule of the memory range gets partially + * unpoisoned based on the @size. 
+ */ +void kasan_unpoison(const void *addr, size_t size, bool init); + +bool kasan_byte_accessible(const void *addr); + +#endif /* CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_GENERIC + +/** + * kasan_poison_last_granule - mark the last granule of the memory range as + * inaccessible + * @address - range start address, must be aligned to KASAN_GRANULE_SIZE + * @size - range size + * + * This function is only available for the generic mode, as it's the only mode + * that has partially poisoned memory granules. + */ +void kasan_poison_last_granule(const void *address, size_t size); + +#else /* CONFIG_KASAN_GENERIC */ + +static inline void kasan_poison_last_granule(const void *address, size_t size) { } + +#endif /* CONFIG_KASAN_GENERIC */ + +#ifndef kasan_arch_is_ready +static inline bool kasan_arch_is_ready(void) { return true; } +#elif !defined(CONFIG_KASAN_GENERIC) || !defined(CONFIG_KASAN_OUTLINE) +#error kasan_arch_is_ready only works in KASAN generic outline mode! +#endif + +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) + +bool kasan_save_enable_multi_shot(void); +void kasan_restore_multi_shot(bool enabled); + +#endif + +/* + * Exported functions for interfaces called from assembly or from generated + * code. Declared here to avoid warnings about missing declarations.
+ */ + +asmlinkage void kasan_unpoison_task_stack_below(const void *watermark); +void __asan_register_globals(struct kasan_global *globals, size_t size); +void __asan_unregister_globals(struct kasan_global *globals, size_t size); +void __asan_handle_no_return(void); +void __asan_alloca_poison(unsigned long addr, size_t size); +void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); + +void __asan_load1(unsigned long addr); +void __asan_store1(unsigned long addr); +void __asan_load2(unsigned long addr); +void __asan_store2(unsigned long addr); +void __asan_load4(unsigned long addr); +void __asan_store4(unsigned long addr); +void __asan_load8(unsigned long addr); +void __asan_store8(unsigned long addr); +void __asan_load16(unsigned long addr); +void __asan_store16(unsigned long addr); +void __asan_loadN(unsigned long addr, size_t size); +void __asan_storeN(unsigned long addr, size_t size); + +void __asan_load1_noabort(unsigned long addr); +void __asan_store1_noabort(unsigned long addr); +void __asan_load2_noabort(unsigned long addr); +void __asan_store2_noabort(unsigned long addr); +void __asan_load4_noabort(unsigned long addr); +void __asan_store4_noabort(unsigned long addr); +void __asan_load8_noabort(unsigned long addr); +void __asan_store8_noabort(unsigned long addr); +void __asan_load16_noabort(unsigned long addr); +void __asan_store16_noabort(unsigned long addr); +void __asan_loadN_noabort(unsigned long addr, size_t size); +void __asan_storeN_noabort(unsigned long addr, size_t size); + +void __asan_report_load1_noabort(unsigned long addr); +void __asan_report_store1_noabort(unsigned long addr); +void __asan_report_load2_noabort(unsigned long addr); +void __asan_report_store2_noabort(unsigned long addr); +void __asan_report_load4_noabort(unsigned long addr); +void __asan_report_store4_noabort(unsigned long addr); +void __asan_report_load8_noabort(unsigned long addr); +void __asan_report_store8_noabort(unsigned long addr); +void 
__asan_report_load16_noabort(unsigned long addr); +void __asan_report_store16_noabort(unsigned long addr); +void __asan_report_load_n_noabort(unsigned long addr, size_t size); +void __asan_report_store_n_noabort(unsigned long addr, size_t size); + +void __asan_set_shadow_00(const void *addr, size_t size); +void __asan_set_shadow_f1(const void *addr, size_t size); +void __asan_set_shadow_f2(const void *addr, size_t size); +void __asan_set_shadow_f3(const void *addr, size_t size); +void __asan_set_shadow_f5(const void *addr, size_t size); +void __asan_set_shadow_f8(const void *addr, size_t size); + +void __hwasan_load1_noabort(unsigned long addr); +void __hwasan_store1_noabort(unsigned long addr); +void __hwasan_load2_noabort(unsigned long addr); +void __hwasan_store2_noabort(unsigned long addr); +void __hwasan_load4_noabort(unsigned long addr); +void __hwasan_store4_noabort(unsigned long addr); +void __hwasan_load8_noabort(unsigned long addr); +void __hwasan_store8_noabort(unsigned long addr); +void __hwasan_load16_noabort(unsigned long addr); +void __hwasan_store16_noabort(unsigned long addr); +void __hwasan_loadN_noabort(unsigned long addr, size_t size); +void __hwasan_storeN_noabort(unsigned long addr, size_t size); + +void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size); + +void kasan_tag_mismatch(unsigned long addr, unsigned long access_info, + unsigned long ret_ip); + +#endif /* __MM_KASAN_KASAN_H */ diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c new file mode 100644 index 000000000..0d59098f0 --- /dev/null +++ b/mm/kasan/kasan_test.c @@ -0,0 +1,1457 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
+ * Author: Andrey Ryabinin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "kasan.h" + +#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE) + +/* + * Some tests use these global variables to store return values from function + * calls that could otherwise be eliminated by the compiler as dead code. + */ +void *kasan_ptr_result; +int kasan_int_result; + +static struct kunit_resource resource; +static struct kunit_kasan_status test_status; +static bool multishot; + +/* + * Temporarily enable multi-shot mode. Otherwise, KASAN would only report the + * first detected bug and panic the kernel if panic_on_warn is enabled. For + * hardware tag-based KASAN also allow tag checking to be reenabled for each + * test, see the comment for KUNIT_EXPECT_KASAN_FAIL(). + */ +static int kasan_test_init(struct kunit *test) +{ + if (!kasan_enabled()) { + kunit_err(test, "can't run KASAN tests with KASAN disabled"); + return -1; + } + + multishot = kasan_save_enable_multi_shot(); + test_status.report_found = false; + test_status.sync_fault = false; + kunit_add_named_resource(test, NULL, NULL, &resource, + "kasan_status", &test_status); + return 0; +} + +static void kasan_test_exit(struct kunit *test) +{ + kasan_restore_multi_shot(multishot); + KUNIT_EXPECT_FALSE(test, test_status.report_found); +} + +/** + * KUNIT_EXPECT_KASAN_FAIL() - check that the executed expression produces a + * KASAN report; causes a test failure otherwise. This relies on a KUnit + * resource named "kasan_status". Do not use this name for KUnit resources + * outside of KASAN tests. + * + * For hardware tag-based KASAN, when a synchronous tag fault happens, tag + * checking is auto-disabled. When this happens, this test handler reenables + * tag checking. 
As tag checking can be only disabled or enabled per CPU, + * this handler disables migration (preemption). + * + * Since the compiler doesn't see that the expression can change the test_status + * fields, it can reorder or optimize away the accesses to those fields. + * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the + * expression to prevent that. + * + * In between KUNIT_EXPECT_KASAN_FAIL checks, test_status.report_found is kept + * as false. This allows detecting KASAN reports that happen outside of the + * checks by asserting !test_status.report_found at the start of + * KUNIT_EXPECT_KASAN_FAIL and in kasan_test_exit. + */ +#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \ + if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ + kasan_sync_fault_possible()) \ + migrate_disable(); \ + KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); \ + barrier(); \ + expression; \ + barrier(); \ + if (kasan_async_fault_possible()) \ + kasan_force_async_fault(); \ + if (!READ_ONCE(test_status.report_found)) { \ + KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure " \ + "expected in \"" #expression \ + "\", but none occurred"); \ + } \ + if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ + kasan_sync_fault_possible()) { \ + if (READ_ONCE(test_status.report_found) && \ + READ_ONCE(test_status.sync_fault)) \ + kasan_enable_tagging(); \ + migrate_enable(); \ + } \ + WRITE_ONCE(test_status.report_found, false); \ +} while (0) + +#define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \ + if (!IS_ENABLED(config)) \ + kunit_skip((test), "Test requires " #config "=y"); \ +} while (0) + +#define KASAN_TEST_NEEDS_CONFIG_OFF(test, config) do { \ + if (IS_ENABLED(config)) \ + kunit_skip((test), "Test requires " #config "=n"); \ +} while (0) + +static void kmalloc_oob_right(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE - 5; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + /* 
+ * An unaligned access past the requested kmalloc size. + * Only generic KASAN can precisely detect these. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'x'); + + /* + * An aligned access into the first out-of-bounds granule that falls + * within the aligned kmalloc object. + */ + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + 5] = 'y'); + + /* Out-of-bounds access past the aligned kmalloc object. */ + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = + ptr[size + KASAN_GRANULE_SIZE + 5]); + + kfree(ptr); +} + +static void kmalloc_oob_left(struct kunit *test) +{ + char *ptr; + size_t size = 15; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, *ptr = *(ptr - 1)); + kfree(ptr); +} + +static void kmalloc_node_oob_right(struct kunit *test) +{ + char *ptr; + size_t size = 4096; + + ptr = kmalloc_node(size, GFP_KERNEL, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]); + kfree(ptr); +} + +/* + * These kmalloc_pagealloc_* tests try allocating a memory chunk that doesn't + * fit into a slab cache and therefore is allocated via the page allocator + * fallback. Since this kind of fallback is only implemented for SLUB, these + * tests are limited to that allocator. 
+ */ +static void kmalloc_pagealloc_oob_right(struct kunit *test) +{ + char *ptr; + size_t size = KMALLOC_MAX_CACHE_SIZE + 10; + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 0); + + kfree(ptr); +} + +static void kmalloc_pagealloc_uaf(struct kunit *test) +{ + char *ptr; + size_t size = KMALLOC_MAX_CACHE_SIZE + 10; + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + kfree(ptr); + + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); +} + +static void kmalloc_pagealloc_invalid_free(struct kunit *test) +{ + char *ptr; + size_t size = KMALLOC_MAX_CACHE_SIZE + 10; + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + KUNIT_EXPECT_KASAN_FAIL(test, kfree(ptr + 1)); +} + +static void pagealloc_oob_right(struct kunit *test) +{ + char *ptr; + struct page *pages; + size_t order = 4; + size_t size = (1UL << (PAGE_SHIFT + order)); + + /* + * With generic KASAN page allocations have no redzones, thus + * out-of-bounds detection is not guaranteed. + * See https://bugzilla.kernel.org/show_bug.cgi?id=210503. 
+ */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + pages = alloc_pages(GFP_KERNEL, order); + ptr = page_address(pages); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]); + free_pages((unsigned long)ptr, order); +} + +static void pagealloc_uaf(struct kunit *test) +{ + char *ptr; + struct page *pages; + size_t order = 4; + + pages = alloc_pages(GFP_KERNEL, order); + ptr = page_address(pages); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + free_pages((unsigned long)ptr, order); + + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); +} + +static void kmalloc_large_oob_right(struct kunit *test) +{ + char *ptr; + size_t size = KMALLOC_MAX_CACHE_SIZE - 256; + + /* + * Allocate a chunk that is large enough, but still fits into a slab + * and does not trigger the page allocator fallback in SLUB. + */ + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0); + kfree(ptr); +} + +static void krealloc_more_oob_helper(struct kunit *test, + size_t size1, size_t size2) +{ + char *ptr1, *ptr2; + size_t middle; + + KUNIT_ASSERT_LT(test, size1, size2); + middle = size1 + (size2 - size1) / 2; + + ptr1 = kmalloc(size1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + + ptr2 = krealloc(ptr1, size2, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + + /* All offsets up to size2 must be accessible. */ + ptr2[size1 - 1] = 'x'; + ptr2[size1] = 'x'; + ptr2[middle] = 'x'; + ptr2[size2 - 1] = 'x'; + + /* Generic mode is precise, so unaligned size2 must be inaccessible. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x'); + + /* For all modes first aligned offset after size2 must be inaccessible. 
*/ + KUNIT_EXPECT_KASAN_FAIL(test, + ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x'); + + kfree(ptr2); +} + +static void krealloc_less_oob_helper(struct kunit *test, + size_t size1, size_t size2) +{ + char *ptr1, *ptr2; + size_t middle; + + KUNIT_ASSERT_LT(test, size2, size1); + middle = size2 + (size1 - size2) / 2; + + ptr1 = kmalloc(size1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + + ptr2 = krealloc(ptr1, size2, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + + /* Must be accessible for all modes. */ + ptr2[size2 - 1] = 'x'; + + /* Generic mode is precise, so unaligned size2 must be inaccessible. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x'); + + /* For all modes first aligned offset after size2 must be inaccessible. */ + KUNIT_EXPECT_KASAN_FAIL(test, + ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x'); + + /* + * For all modes all size2, middle, and size1 should land in separate + * granules and thus the latter two offsets should be inaccessible. + */ + KUNIT_EXPECT_LE(test, round_up(size2, KASAN_GRANULE_SIZE), + round_down(middle, KASAN_GRANULE_SIZE)); + KUNIT_EXPECT_LE(test, round_up(middle, KASAN_GRANULE_SIZE), + round_down(size1, KASAN_GRANULE_SIZE)); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[middle] = 'x'); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1 - 1] = 'x'); + KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1] = 'x'); + + kfree(ptr2); +} + +static void krealloc_more_oob(struct kunit *test) +{ + krealloc_more_oob_helper(test, 201, 235); +} + +static void krealloc_less_oob(struct kunit *test) +{ + krealloc_less_oob_helper(test, 235, 201); +} + +static void krealloc_pagealloc_more_oob(struct kunit *test) +{ + /* page_alloc fallback is only implemented for SLUB. 
*/ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + krealloc_more_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 201, + KMALLOC_MAX_CACHE_SIZE + 235); +} + +static void krealloc_pagealloc_less_oob(struct kunit *test) +{ + /* page_alloc fallback is only implemented for SLUB. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); + + krealloc_less_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 235, + KMALLOC_MAX_CACHE_SIZE + 201); +} + +/* + * Check that krealloc() detects a use-after-free, returns NULL, + * and doesn't unpoison the freed object. + */ +static void krealloc_uaf(struct kunit *test) +{ + char *ptr1, *ptr2; + int size1 = 201; + int size2 = 235; + + ptr1 = kmalloc(size1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + kfree(ptr1); + + KUNIT_EXPECT_KASAN_FAIL(test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL)); + KUNIT_ASSERT_NULL(test, ptr2); + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)ptr1); +} + +static void kmalloc_oob_16(struct kunit *test) +{ + struct { + u64 words[2]; + } *ptr1, *ptr2; + + /* This test is specifically crafted for the generic mode. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + + ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + + ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + + OPTIMIZER_HIDE_VAR(ptr1); + OPTIMIZER_HIDE_VAR(ptr2); + KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2); + kfree(ptr1); + kfree(ptr2); +} + +static void kmalloc_uaf_16(struct kunit *test) +{ + struct { + u64 words[2]; + } *ptr1, *ptr2; + + ptr1 = kmalloc(sizeof(*ptr1), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + + ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + kfree(ptr2); + + KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2); + kfree(ptr1); +} + +/* + * Note: in the memset tests below, the written range touches both valid and + * invalid memory.
This makes sure that the instrumentation does not only check + * the starting address but the whole range. + */ + +static void kmalloc_oob_memset_2(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2)); + kfree(ptr); +} + +static void kmalloc_oob_memset_4(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4)); + kfree(ptr); +} + +static void kmalloc_oob_memset_8(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8)); + kfree(ptr); +} + +static void kmalloc_oob_memset_16(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16)); + kfree(ptr); +} + +static void kmalloc_oob_in_memset(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, + memset(ptr, 0, size + KASAN_GRANULE_SIZE)); + kfree(ptr); +} + +static void kmalloc_memmove_negative_size(struct kunit *test) +{ + char *ptr; + size_t size = 64; + size_t invalid_size = -2; + + /* + * Hardware tag-based mode doesn't check memmove for negative size. + * As a result, this test introduces a side-effect memory corruption, + * which can result in a crash. 
+ */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_HW_TAGS); + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + memset((char *)ptr, 0, 64); + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(invalid_size); + KUNIT_EXPECT_KASAN_FAIL(test, + memmove((char *)ptr, (char *)ptr + 4, invalid_size)); + kfree(ptr); +} + +static void kmalloc_memmove_invalid_size(struct kunit *test) +{ + char *ptr; + size_t size = 64; + size_t invalid_size = size; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + memset((char *)ptr, 0, 64); + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(invalid_size); + KUNIT_EXPECT_KASAN_FAIL(test, + memmove((char *)ptr, (char *)ptr + 4, invalid_size)); + kfree(ptr); +} + +static void kmalloc_uaf(struct kunit *test) +{ + char *ptr; + size_t size = 10; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + kfree(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[8]); +} + +static void kmalloc_uaf_memset(struct kunit *test) +{ + char *ptr; + size_t size = 33; + + /* + * Only generic KASAN uses quarantine, which is required to avoid a + * kernel memory corruption this test causes. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + kfree(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size)); +} + +static void kmalloc_uaf2(struct kunit *test) +{ + char *ptr1, *ptr2; + size_t size = 43; + int counter = 0; + +again: + ptr1 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + + kfree(ptr1); + + ptr2 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + + /* + * For tag-based KASAN ptr1 and ptr2 tags might happen to be the same. + * Allow up to 16 attempts at generating different tags. 
+ */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && ptr1 == ptr2 && counter++ < 16) { + kfree(ptr2); + goto again; + } + + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[40]); + KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2); + + kfree(ptr2); +} + +/* + * Check that KASAN detects use-after-free when another object was allocated in + * the same slot. Relevant for the tag-based modes, which do not use quarantine. + */ +static void kmalloc_uaf3(struct kunit *test) +{ + char *ptr1, *ptr2; + size_t size = 100; + + /* This test is specifically crafted for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + ptr1 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + kfree(ptr1); + + ptr2 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + kfree(ptr2); + + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]); +} + +static void kfree_via_page(struct kunit *test) +{ + char *ptr; + size_t size = 8; + struct page *page; + unsigned long offset; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + page = virt_to_page(ptr); + offset = offset_in_page(ptr); + kfree(page_address(page) + offset); +} + +static void kfree_via_phys(struct kunit *test) +{ + char *ptr; + size_t size = 8; + phys_addr_t phys; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + phys = virt_to_phys(ptr); + kfree(phys_to_virt(phys)); +} + +static void kmem_cache_oob(struct kunit *test) +{ + char *p; + size_t size = 200; + struct kmem_cache *cache; + + cache = kmem_cache_create("test_cache", size, 0, 0, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) { + kunit_err(test, "Allocation failed: %s\n", __func__); + kmem_cache_destroy(cache); + return; + } + + KUNIT_EXPECT_KASAN_FAIL(test, *p = p[size + OOB_TAG_OFF]); + + kmem_cache_free(cache, p); + kmem_cache_destroy(cache); +} + +static void kmem_cache_accounted(struct 
kunit *test) +{ + int i; + char *p; + size_t size = 200; + struct kmem_cache *cache; + + cache = kmem_cache_create("test_cache", size, 0, SLAB_ACCOUNT, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + /* + * Several allocations with a delay to allow for lazy per memcg kmem + * cache creation. + */ + for (i = 0; i < 5; i++) { + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) + goto free_cache; + + kmem_cache_free(cache, p); + msleep(100); + } + +free_cache: + kmem_cache_destroy(cache); +} + +static void kmem_cache_bulk(struct kunit *test) +{ + struct kmem_cache *cache; + size_t size = 200; + char *p[10]; + bool ret; + int i; + + cache = kmem_cache_create("test_cache", size, 0, 0, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + ret = kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(p), (void **)&p); + if (!ret) { + kunit_err(test, "Allocation failed: %s\n", __func__); + kmem_cache_destroy(cache); + return; + } + + for (i = 0; i < ARRAY_SIZE(p); i++) + p[i][0] = p[i][size - 1] = 42; + + kmem_cache_free_bulk(cache, ARRAY_SIZE(p), (void **)&p); + kmem_cache_destroy(cache); +} + +static char global_array[10]; + +static void kasan_global_oob_right(struct kunit *test) +{ + /* + * Deliberate out-of-bounds access. To prevent CONFIG_UBSAN_LOCAL_BOUNDS + * from failing here and panicking the kernel, access the array via a + * volatile pointer, which will prevent the compiler from being able to + * determine the array bounds. + * + * This access uses a volatile pointer to char (char *volatile) rather + * than the more conventional pointer to volatile char (volatile char *) + * because we want to prevent the compiler from making inferences about + * the pointer itself (i.e. its array bounds), not the data that it + * refers to. + */ + char *volatile array = global_array; + char *p = &array[ARRAY_SIZE(global_array) + 3]; + + /* Only generic mode instruments globals. 
*/ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); +} + +static void kasan_global_oob_left(struct kunit *test) +{ + char *volatile array = global_array; + char *p = array - 3; + + /* + * GCC is known to fail this test, skip it. + * See https://bugzilla.kernel.org/show_bug.cgi?id=215051. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_CC_IS_CLANG); + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); +} + +/* Check that ksize() makes the whole object accessible. */ +static void ksize_unpoisons_memory(struct kunit *test) +{ + char *ptr; + size_t size = 123, real_size; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + real_size = ksize(ptr); + + OPTIMIZER_HIDE_VAR(ptr); + + /* This access shouldn't trigger a KASAN report. */ + ptr[size] = 'x'; + + /* This one must. */ + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size]); + + kfree(ptr); +} + +/* + * Check that a use-after-free is detected by ksize() and via normal accesses + * after it. + */ +static void ksize_uaf(struct kunit *test) +{ + char *ptr; + int size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + kfree(ptr); + + OPTIMIZER_HIDE_VAR(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr)); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]); +} + +static void kasan_stack_oob(struct kunit *test) +{ + char stack_array[10]; + /* See comment in kasan_global_oob_right. 
*/ + char *volatile array = stack_array; + char *p = &array[ARRAY_SIZE(stack_array) + OOB_TAG_OFF]; + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK); + + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); +} + +static void kasan_alloca_oob_left(struct kunit *test) +{ + volatile int i = 10; + char alloca_array[i]; + /* See comment in kasan_global_oob_right. */ + char *volatile array = alloca_array; + char *p = array - 1; + + /* Only generic mode instruments dynamic allocas. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK); + + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); +} + +static void kasan_alloca_oob_right(struct kunit *test) +{ + volatile int i = 10; + char alloca_array[i]; + /* See comment in kasan_global_oob_right. */ + char *volatile array = alloca_array; + char *p = array + i; + + /* Only generic mode instruments dynamic allocas. */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK); + + KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); +} + +static void kmem_cache_double_free(struct kunit *test) +{ + char *p; + size_t size = 200; + struct kmem_cache *cache; + + cache = kmem_cache_create("test_cache", size, 0, 0, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) { + kunit_err(test, "Allocation failed: %s\n", __func__); + kmem_cache_destroy(cache); + return; + } + + kmem_cache_free(cache, p); + KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p)); + kmem_cache_destroy(cache); +} + +static void kmem_cache_invalid_free(struct kunit *test) +{ + char *p; + size_t size = 200; + struct kmem_cache *cache; + + cache = kmem_cache_create("test_cache", size, 0, SLAB_TYPESAFE_BY_RCU, + NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) { + kunit_err(test, "Allocation failed: %s\n", __func__); + 
kmem_cache_destroy(cache); + return; + } + + /* Trigger invalid free, the object doesn't get freed. */ + KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p + 1)); + + /* + * Properly free the object to prevent the "Objects remaining in + * test_cache on __kmem_cache_shutdown" BUG failure. + */ + kmem_cache_free(cache, p); + + kmem_cache_destroy(cache); +} + +static void empty_cache_ctor(void *object) { } + +static void kmem_cache_double_destroy(struct kunit *test) +{ + struct kmem_cache *cache; + + /* Provide a constructor to prevent cache merging. */ + cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + kmem_cache_destroy(cache); + KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache)); +} + +static void kasan_memchr(struct kunit *test) +{ + char *ptr; + size_t size = 24; + + /* + * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT. + * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details. + */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT); + + if (OOB_TAG_OFF) + size = round_up(size, OOB_TAG_OFF); + + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, + kasan_ptr_result = memchr(ptr, '1', size + 1)); + + kfree(ptr); +} + +static void kasan_memcmp(struct kunit *test) +{ + char *ptr; + size_t size = 24; + int arr[9]; + + /* + * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT. + * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details. 
+ */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT); + + if (OOB_TAG_OFF) + size = round_up(size, OOB_TAG_OFF); + + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + memset(arr, 0, sizeof(arr)); + + OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(size); + KUNIT_EXPECT_KASAN_FAIL(test, + kasan_int_result = memcmp(ptr, arr, size+1)); + kfree(ptr); +} + +static void kasan_strings(struct kunit *test) +{ + char *ptr; + size_t size = 24; + + /* + * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT. + * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details. + */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT); + + ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + kfree(ptr); + + /* + * Try to cause only 1 invalid access (less spam in dmesg). + * For that we need ptr to point to zeroed byte. + * Skip metadata that could be stored in freed object so ptr + * will likely point to zeroed byte. 
+ */ + ptr += 16; + KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strchr(ptr, '1')); + + KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strrchr(ptr, '1')); + + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strcmp(ptr, "2")); + + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strncmp(ptr, "2", 1)); + + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strlen(ptr)); + + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strnlen(ptr, 1)); +} + +static void kasan_bitops_modify(struct kunit *test, int nr, void *addr) +{ + KUNIT_EXPECT_KASAN_FAIL(test, set_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __set_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, clear_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, clear_bit_unlock(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit_unlock(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, change_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __change_bit(nr, addr)); +} + +static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr) +{ + KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __test_and_set_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, test_and_clear_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __test_and_clear_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr)); + +#if defined(clear_bit_unlock_is_negative_byte) + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = + clear_bit_unlock_is_negative_byte(nr, addr)); +#endif +} + +static void kasan_bitops_generic(struct kunit *test) +{ + long *bits; + + /* This test is specifically crafted for the generic mode. 
*/ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + + /* + * Allocate 1 more byte, which causes kzalloc to round up to 16 bytes; + * this way we do not actually corrupt other memory. + */ + bits = kzalloc(sizeof(*bits) + 1, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits); + + /* + * Below calls try to access bit within allocated memory; however, the + * below accesses are still out-of-bounds, since bitops are defined to + * operate on the whole long the bit is in. + */ + kasan_bitops_modify(test, BITS_PER_LONG, bits); + + /* + * Below calls try to access bit beyond allocated memory. + */ + kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, bits); + + kfree(bits); +} +/* Tag-based counterpart of kasan_bitops_generic: bitops aimed past a 48-byte kzalloc() must be reported. */ +static void kasan_bitops_tags(struct kunit *test) +{ + long *bits; + + /* This test is specifically crafted for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + /* kmalloc-64 cache will be used and the last 16 bytes will be the redzone. */ + bits = kzalloc(48, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits); + + /* Do the accesses past the 48 allocated bytes, but within the redzone. */ + kasan_bitops_modify(test, BITS_PER_LONG, (void *)bits + 48); + kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, (void *)bits + 48); + + kfree(bits); +} +/* Check that a second kfree_sensitive() on the same object is reported. */ +static void kmalloc_double_kzfree(struct kunit *test) +{ + char *ptr; + size_t size = 16; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + kfree_sensitive(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr)); +} + +static void vmalloc_helpers_tags(struct kunit *test) +{ + void *ptr; + + /* This test is intended for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + ptr = vmalloc(PAGE_SIZE); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + /* Check that the returned pointer is tagged. 
*/ + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure exported vmalloc helpers handle tagged pointers. */ + KUNIT_ASSERT_TRUE(test, is_vmalloc_addr(ptr)); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, vmalloc_to_page(ptr)); + +#if !IS_MODULE(CONFIG_KASAN_KUNIT_TEST) + { + int rv; + + /* Make sure vmalloc'ed memory permissions can be changed. */ + rv = set_memory_ro((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + rv = set_memory_rw((unsigned long)ptr, 1); + KUNIT_ASSERT_GE(test, rv, 0); + } +#endif + + vfree(ptr); +} + +static void vmalloc_oob(struct kunit *test) +{ + char *v_ptr, *p_ptr; + struct page *page; + size_t size = PAGE_SIZE / 2 - KASAN_GRANULE_SIZE - 5; + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + v_ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + OPTIMIZER_HIDE_VAR(v_ptr); + + /* + * We have to be careful not to hit the guard page in vmalloc tests. + * The MMU will catch that and crash us. + */ + + /* Make sure in-bounds accesses are valid. */ + v_ptr[0] = 0; + v_ptr[size - 1] = 0; + + /* + * An unaligned access past the requested vmalloc size. + * Only generic KASAN can precisely detect these. + */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]); + + /* An aligned access into the first out-of-bounds granule. */ + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]); + + /* Check that in-bounds accesses to the physical page are valid. */ + page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + p_ptr[0] = 0; + + vfree(v_ptr); + + /* + * We can't check for use-after-unmap bugs in this nor in the following + * vmalloc tests, as the page might be fully unmapped and accessing it + * will crash the kernel. 
+ */ +} + +static void vmap_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *p_page, *v_page; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vmap mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); + + p_page = alloc_pages(GFP_KERNEL, 1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_page); + p_ptr = page_address(p_page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vmap(&p_page, 1, VM_MAP, PAGE_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + /* + * We can't check for out-of-bounds bugs in this nor in the following + * vmalloc tests, as allocations have page granularity and accessing + * the guard page will crash the kernel. + */ + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + /* Make sure vmalloc_to_page() correctly recovers the page pointer. */ + v_page = vmalloc_to_page(v_ptr); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_page); + KUNIT_EXPECT_PTR_EQ(test, p_page, v_page); + + vunmap(v_ptr); + free_pages((unsigned long)p_ptr, 1); +} + +static void vm_map_ram_tags(struct kunit *test) +{ + char *p_ptr, *v_ptr; + struct page *page; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons vm_map_ram mappings. 
+ */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + page = alloc_pages(GFP_KERNEL, 1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page); + p_ptr = page_address(page); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr); + + v_ptr = vm_map_ram(&page, 1, -1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr); + + KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses through both pointers work. */ + *p_ptr = 0; + *v_ptr = 0; + + vm_unmap_ram(v_ptr, 1); + free_pages((unsigned long)p_ptr, 1); +} + +/* + * Check that a per-cpu allocation yields, for every possible CPU, a pointer + * whose tag lies in [KASAN_TAG_MIN, KASAN_TAG_KERNEL) and that can be written. + */ +static void vmalloc_percpu(struct kunit *test) +{ + char __percpu *ptr; + int cpu; + + /* + * This test is specifically crafted for the software tag-based mode, + * the only tag-based mode that poisons percpu mappings. + */ + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); + + ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + /* Abort before per_cpu_ptr() is applied to a failed allocation. */ + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + for_each_possible_cpu(cpu) { + char *c_ptr = per_cpu_ptr(ptr, cpu); + + KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL); + + /* Make sure that in-bounds accesses don't crash the kernel. */ + *c_ptr = 0; + } + + free_percpu(ptr); +} + +/* + * Check that the assigned pointer tag falls within the [KASAN_TAG_MIN, + * KASAN_TAG_KERNEL) range (note: excluding the match-all tag) for tag-based + * modes. 
+ */ +static void match_all_not_assigned(struct kunit *test) +{ + char *ptr; + struct page *pages; + int i, size, order; + + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + /* Slab allocations of random size. */ + for (i = 0; i < 256; i++) { + size = prandom_u32_max(1024) + 1; + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + kfree(ptr); + } + + /* Page allocations of random order. */ + for (i = 0; i < 256; i++) { + order = prandom_u32_max(4) + 1; + pages = alloc_pages(GFP_KERNEL, order); + /* page_address() must not be handed a failed allocation. */ + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pages); + ptr = page_address(pages); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + free_pages((unsigned long)ptr, order); + } + + /* The vmalloc part only runs when CONFIG_KASAN_VMALLOC is enabled. */ + if (!IS_ENABLED(CONFIG_KASAN_VMALLOC)) + return; + + for (i = 0; i < 256; i++) { + size = prandom_u32_max(1024) + 1; + ptr = vmalloc(size); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + vfree(ptr); + } +} + +/* Check that 0xff works as a match-all pointer tag for tag-based modes. */ +static void match_all_ptr_tag(struct kunit *test) +{ + char *ptr; + u8 tag; + + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + ptr = kmalloc(128, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + /* Back up the assigned tag. */ + tag = get_tag(ptr); + KUNIT_EXPECT_NE(test, tag, (u8)KASAN_TAG_KERNEL); + + /* Reset the tag to 0xff. */ + ptr = set_tag(ptr, KASAN_TAG_KERNEL); + + /* This access shouldn't trigger a KASAN report. */ + *ptr = 0; + + /* Recover the pointer tag and free. */ + ptr = set_tag(ptr, tag); + kfree(ptr); +} + +/* Check that there are no match-all memory tags for tag-based modes. 
*/ +static void match_all_mem_tag(struct kunit *test) +{ + char *ptr; + int tag; + + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + ptr = kmalloc(128, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_NE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + + /* For each possible tag value not matching the pointer tag. */ + for (tag = KASAN_TAG_MIN; tag <= KASAN_TAG_KERNEL; tag++) { + if (tag == get_tag(ptr)) + continue; + + /* Mark the first memory granule with the chosen memory tag. */ + kasan_poison(ptr, KASAN_GRANULE_SIZE, (u8)tag, false); + + /* This access must cause a KASAN report. */ + KUNIT_EXPECT_KASAN_FAIL(test, *ptr = 0); + } + + /* Recover the memory tag and free. */ + kasan_poison(ptr, KASAN_GRANULE_SIZE, get_tag(ptr), false); + kfree(ptr); +} + +static struct kunit_case kasan_kunit_test_cases[] = { + KUNIT_CASE(kmalloc_oob_right), + KUNIT_CASE(kmalloc_oob_left), + KUNIT_CASE(kmalloc_node_oob_right), + KUNIT_CASE(kmalloc_pagealloc_oob_right), + KUNIT_CASE(kmalloc_pagealloc_uaf), + KUNIT_CASE(kmalloc_pagealloc_invalid_free), + KUNIT_CASE(pagealloc_oob_right), + KUNIT_CASE(pagealloc_uaf), + KUNIT_CASE(kmalloc_large_oob_right), + KUNIT_CASE(krealloc_more_oob), + KUNIT_CASE(krealloc_less_oob), + KUNIT_CASE(krealloc_pagealloc_more_oob), + KUNIT_CASE(krealloc_pagealloc_less_oob), + KUNIT_CASE(krealloc_uaf), + KUNIT_CASE(kmalloc_oob_16), + KUNIT_CASE(kmalloc_uaf_16), + KUNIT_CASE(kmalloc_oob_in_memset), + KUNIT_CASE(kmalloc_oob_memset_2), + KUNIT_CASE(kmalloc_oob_memset_4), + KUNIT_CASE(kmalloc_oob_memset_8), + KUNIT_CASE(kmalloc_oob_memset_16), + KUNIT_CASE(kmalloc_memmove_negative_size), + KUNIT_CASE(kmalloc_memmove_invalid_size), + KUNIT_CASE(kmalloc_uaf), + KUNIT_CASE(kmalloc_uaf_memset), + KUNIT_CASE(kmalloc_uaf2), + KUNIT_CASE(kmalloc_uaf3), + KUNIT_CASE(kfree_via_page), + KUNIT_CASE(kfree_via_phys), + KUNIT_CASE(kmem_cache_oob), + KUNIT_CASE(kmem_cache_accounted), + KUNIT_CASE(kmem_cache_bulk), + 
KUNIT_CASE(kasan_global_oob_right), + KUNIT_CASE(kasan_global_oob_left), + KUNIT_CASE(kasan_stack_oob), + KUNIT_CASE(kasan_alloca_oob_left), + KUNIT_CASE(kasan_alloca_oob_right), + KUNIT_CASE(ksize_unpoisons_memory), + KUNIT_CASE(ksize_uaf), + KUNIT_CASE(kmem_cache_double_free), + KUNIT_CASE(kmem_cache_invalid_free), + KUNIT_CASE(kmem_cache_double_destroy), + KUNIT_CASE(kasan_memchr), + KUNIT_CASE(kasan_memcmp), + KUNIT_CASE(kasan_strings), + KUNIT_CASE(kasan_bitops_generic), + KUNIT_CASE(kasan_bitops_tags), + KUNIT_CASE(kmalloc_double_kzfree), + KUNIT_CASE(vmalloc_helpers_tags), + KUNIT_CASE(vmalloc_oob), + KUNIT_CASE(vmap_tags), + KUNIT_CASE(vm_map_ram_tags), + KUNIT_CASE(vmalloc_percpu), + KUNIT_CASE(match_all_not_assigned), + KUNIT_CASE(match_all_ptr_tag), + KUNIT_CASE(match_all_mem_tag), + {} +}; + +static struct kunit_suite kasan_kunit_test_suite = { + .name = "kasan", + .init = kasan_test_init, + .test_cases = kasan_kunit_test_cases, + .exit = kasan_test_exit, +}; + +kunit_test_suite(kasan_kunit_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c new file mode 100644 index 000000000..e4ca82dc2 --- /dev/null +++ b/mm/kasan/kasan_test_module.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
+ * Author: Andrey Ryabinin + */ + +#define pr_fmt(fmt) "kasan test: %s " fmt, __func__ + +#include +#include +#include +#include +#include + +#include "kasan.h" + +static noinline void __init copy_user_test(void) +{ + char *kmem; + char __user *usermem; + size_t size = 128 - KASAN_GRANULE_SIZE; + int __maybe_unused unused; + + kmem = kmalloc(size, GFP_KERNEL); + if (!kmem) + return; + + usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE, 0); + if (IS_ERR(usermem)) { + pr_err("Failed to allocate user memory\n"); + kfree(kmem); + return; + } + + OPTIMIZER_HIDE_VAR(size); + + pr_info("out-of-bounds in copy_from_user()\n"); + unused = copy_from_user(kmem, usermem, size + 1); + + pr_info("out-of-bounds in copy_to_user()\n"); + unused = copy_to_user(usermem, kmem, size + 1); + + pr_info("out-of-bounds in __copy_from_user()\n"); + unused = __copy_from_user(kmem, usermem, size + 1); + + pr_info("out-of-bounds in __copy_to_user()\n"); + unused = __copy_to_user(usermem, kmem, size + 1); + + pr_info("out-of-bounds in __copy_from_user_inatomic()\n"); + unused = __copy_from_user_inatomic(kmem, usermem, size + 1); + + pr_info("out-of-bounds in __copy_to_user_inatomic()\n"); + unused = __copy_to_user_inatomic(usermem, kmem, size + 1); + + pr_info("out-of-bounds in strncpy_from_user()\n"); + unused = strncpy_from_user(kmem, usermem, size + 1); + + vm_munmap((unsigned long)usermem, PAGE_SIZE); + kfree(kmem); +} + +static struct kasan_rcu_info { + int i; + struct rcu_head rcu; +} *global_rcu_ptr; + +static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp) +{ + struct kasan_rcu_info *fp = container_of(rp, + struct kasan_rcu_info, rcu); + + kfree(fp); + ((volatile struct kasan_rcu_info *)fp)->i; +} + +static noinline void __init kasan_rcu_uaf(void) +{ + struct kasan_rcu_info *ptr; + + pr_info("use-after-free in kasan_rcu_reclaim\n"); + ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL); + if (!ptr) 
{ + pr_err("Allocation failed\n"); + return; + } + + global_rcu_ptr = rcu_dereference_protected(ptr, NULL); + call_rcu(&global_rcu_ptr->rcu, kasan_rcu_reclaim); +} + +/* Work callback: frees the work item it was invoked with. */ +static noinline void __init kasan_workqueue_work(struct work_struct *work) +{ + kfree(work); +} + +/* + * Queue a work item whose callback frees it, tear the workqueue down, then + * read the work item again to provoke a use-after-free report. + */ +static noinline void __init kasan_workqueue_uaf(void) +{ + struct workqueue_struct *workqueue; + struct work_struct *work; + + workqueue = create_workqueue("kasan_wq_test"); + if (!workqueue) { + pr_err("Allocation failed\n"); + return; + } + work = kmalloc(sizeof(struct work_struct), GFP_KERNEL); + if (!work) { + pr_err("Allocation failed\n"); + /* Do not leak the workqueue created above. */ + destroy_workqueue(workqueue); + return; + } + + INIT_WORK(work, kasan_workqueue_work); + queue_work(workqueue, work); + destroy_workqueue(workqueue); + + pr_info("use-after-free on workqueue\n"); + ((volatile struct work_struct *)work)->data; +} + +static int __init test_kasan_module_init(void) +{ + /* + * Temporarily enable multi-shot mode. Otherwise, KASAN would only + * report the first detected bug and panic the kernel if panic_on_warn + * is enabled. + */ + bool multishot = kasan_save_enable_multi_shot(); + + copy_user_test(); + kasan_rcu_uaf(); + kasan_workqueue_uaf(); + + kasan_restore_multi_shot(multishot); + return -EAGAIN; +} + +module_init(test_kasan_module_init); +MODULE_LICENSE("GPL"); diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c new file mode 100644 index 000000000..75585077e --- /dev/null +++ b/mm/kasan/quarantine.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KASAN quarantine. + * + * Author: Alexander Potapenko + * Copyright (C) 2016 Google, Inc. + * + * Based on code by Dmitry Chernenkov. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../slab.h" +#include "kasan.h" + +/* Data structure and operations for quarantine queues. */ + +/* + * Each queue is a single-linked list, which also stores the total size of + * objects inside of it. 
+ */ +struct qlist_head { + struct qlist_node *head; /* first node; NULL when the queue is empty */ + struct qlist_node *tail; /* last node; qlist_put() links new nodes after it */ + size_t bytes; /* sum of the sizes passed to qlist_put() */ + bool offline; /* set by kasan_cpu_offline(), cleared by kasan_cpu_online() */ +}; +/* Static initializer for an empty queue; the omitted offline member is zero-initialized. */ +#define QLIST_INIT { NULL, NULL, 0 } +/* Return true when the queue holds no nodes. */ +static bool qlist_empty(struct qlist_head *q) +{ + return !q->head; +} +/* Reset the list pointers and the byte count; the offline flag is left untouched. */ +static void qlist_init(struct qlist_head *q) +{ + q->head = q->tail = NULL; + q->bytes = 0; +} +/* Append qlink at the tail of q and add size to the queue's byte count. */ +static void qlist_put(struct qlist_head *q, struct qlist_node *qlink, + size_t size) +{ + if (unlikely(qlist_empty(q))) + q->head = qlink; + else + q->tail->next = qlink; + q->tail = qlink; + qlink->next = NULL; + q->bytes += size; +} +/* Move every node of from to the end of to, leaving from empty. */ +static void qlist_move_all(struct qlist_head *from, struct qlist_head *to) +{ + if (unlikely(qlist_empty(from))) + return; + + if (qlist_empty(to)) { + /* Whole-struct copy: this also copies from->offline into to. */ + *to = *from; + qlist_init(from); + return; + } + + to->tail->next = from->head; + to->tail = from->tail; + to->bytes += from->bytes; + + qlist_init(from); +} + +#define QUARANTINE_PERCPU_SIZE (1 << 20) +#define QUARANTINE_BATCHES \ + (1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS) + +/* + * The object quarantine consists of per-cpu queues and a global queue, + * guarded by quarantine_lock. + */ +static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine); + +/* Round-robin FIFO array of batches. */ +static struct qlist_head global_quarantine[QUARANTINE_BATCHES]; +static int quarantine_head; +static int quarantine_tail; +/* Total size of all objects in global_quarantine across all batches. */ +static unsigned long quarantine_size; +static DEFINE_RAW_SPINLOCK(quarantine_lock); +DEFINE_STATIC_SRCU(remove_cache_srcu); + +#ifdef CONFIG_PREEMPT_RT +struct cpu_shrink_qlist { + raw_spinlock_t lock; + struct qlist_head qlist; +}; + +static DEFINE_PER_CPU(struct cpu_shrink_qlist, shrink_qlist) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(shrink_qlist.lock), +}; +#endif + +/* Maximum size of the global queue. */ +static unsigned long quarantine_max_size; + +/* + * Target size of a batch in global_quarantine. 
+ * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM. + */ +static unsigned long quarantine_batch_size; + +/* + * The fraction of physical memory the quarantine is allowed to occupy. + * Quarantine doesn't support memory shrinker with SLAB allocator, so we keep + * the ratio low to avoid OOM. + */ +#define QUARANTINE_FRACTION 32 + +static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) +{ + return virt_to_slab(qlink)->slab_cache; +} + +static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) +{ + struct kasan_free_meta *free_info = + container_of(qlink, struct kasan_free_meta, + quarantine_link); + + return ((void *)free_info) - cache->kasan_info.free_meta_offset; +} + +static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) +{ + void *object = qlink_to_object(qlink, cache); + struct kasan_free_meta *meta = kasan_get_free_meta(cache, object); + unsigned long flags; + + if (IS_ENABLED(CONFIG_SLAB)) + local_irq_save(flags); + + /* + * If init_on_free is enabled and KASAN's free metadata is stored in + * the object, zero the metadata. Otherwise, the object's memory will + * not be properly zeroed, as KASAN saves the metadata after the slab + * allocator zeroes the object. + */ + if (slab_want_init_on_free(cache) && + cache->kasan_info.free_meta_offset == 0) + memzero_explicit(meta, sizeof(*meta)); + + /* + * As the object now gets freed from the quarantine, assume that its + * free track is no longer valid. + */ + *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE; + + ___cache_free(cache, object, _THIS_IP_); + + if (IS_ENABLED(CONFIG_SLAB)) + local_irq_restore(flags); +} + +static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache) +{ + struct qlist_node *qlink; + + if (unlikely(qlist_empty(q))) + return; + + qlink = q->head; + while (qlink) { + struct kmem_cache *obj_cache = + cache ? 
cache : qlink_to_cache(qlink); + struct qlist_node *next = qlink->next; + + qlink_free(qlink, obj_cache); + qlink = next; + } + qlist_init(q); +} + +bool kasan_quarantine_put(struct kmem_cache *cache, void *object) +{ + unsigned long flags; + struct qlist_head *q; + struct qlist_head temp = QLIST_INIT; + struct kasan_free_meta *meta = kasan_get_free_meta(cache, object); + + /* + * If there's no metadata for this object, don't put it into + * quarantine. + */ + if (!meta) + return false; + + /* + * Note: irq must be disabled until after we move the batch to the + * global quarantine. Otherwise kasan_quarantine_remove_cache() can + * miss some objects belonging to the cache if they are in our local + * temp list. kasan_quarantine_remove_cache() executes on_each_cpu() + * at the beginning which ensures that it either sees the objects in + * per-cpu lists or in the global quarantine. + */ + local_irq_save(flags); + + q = this_cpu_ptr(&cpu_quarantine); + if (q->offline) { + local_irq_restore(flags); + return false; + } + qlist_put(q, &meta->quarantine_link, cache->size); + if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) { + qlist_move_all(q, &temp); + + raw_spin_lock(&quarantine_lock); + WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); + qlist_move_all(&temp, &global_quarantine[quarantine_tail]); + if (global_quarantine[quarantine_tail].bytes >= + READ_ONCE(quarantine_batch_size)) { + int new_tail; + + new_tail = quarantine_tail + 1; + if (new_tail == QUARANTINE_BATCHES) + new_tail = 0; + if (new_tail != quarantine_head) + quarantine_tail = new_tail; + } + raw_spin_unlock(&quarantine_lock); + } + + local_irq_restore(flags); + + return true; +} + +void kasan_quarantine_reduce(void) +{ + size_t total_size, new_quarantine_size, percpu_quarantines; + unsigned long flags; + int srcu_idx; + struct qlist_head to_free = QLIST_INIT; + + if (likely(READ_ONCE(quarantine_size) <= + READ_ONCE(quarantine_max_size))) + return; + + /* + * srcu critical section ensures that 
kasan_quarantine_remove_cache() + * will not miss objects belonging to the cache while they are in our + * local to_free list. srcu is chosen because (1) it gives us private + * grace period domain that does not interfere with anything else, + * and (2) it allows synchronize_srcu() to return without waiting + * if there are no pending read critical sections (which is the + * expected case). + */ + srcu_idx = srcu_read_lock(&remove_cache_srcu); + raw_spin_lock_irqsave(&quarantine_lock, flags); + + /* + * Update quarantine size in case of hotplug. Allocate a fraction of + * the installed memory to quarantine minus per-cpu queue limits. + */ + total_size = (totalram_pages() << PAGE_SHIFT) / + QUARANTINE_FRACTION; + percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); + new_quarantine_size = (total_size < percpu_quarantines) ? + 0 : total_size - percpu_quarantines; + WRITE_ONCE(quarantine_max_size, new_quarantine_size); + /* Aim at consuming at most 1/2 of slots in quarantine. */ + WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE, + 2 * total_size / QUARANTINE_BATCHES)); + + if (likely(quarantine_size > quarantine_max_size)) { + qlist_move_all(&global_quarantine[quarantine_head], &to_free); + WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes); + quarantine_head++; + if (quarantine_head == QUARANTINE_BATCHES) + quarantine_head = 0; + } + + raw_spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, NULL); + srcu_read_unlock(&remove_cache_srcu, srcu_idx); +} + +static void qlist_move_cache(struct qlist_head *from, + struct qlist_head *to, + struct kmem_cache *cache) +{ + struct qlist_node *curr; + + if (unlikely(qlist_empty(from))) + return; + + curr = from->head; + qlist_init(from); + while (curr) { + struct qlist_node *next = curr->next; + struct kmem_cache *obj_cache = qlink_to_cache(curr); + + if (obj_cache == cache) + qlist_put(to, curr, obj_cache->size); + else + qlist_put(from, curr, obj_cache->size); 
+ + curr = next; + } +} + +#ifndef CONFIG_PREEMPT_RT +static void __per_cpu_remove_cache(struct qlist_head *q, void *arg) +{ + struct kmem_cache *cache = arg; + struct qlist_head to_free = QLIST_INIT; + + qlist_move_cache(q, &to_free, cache); + qlist_free_all(&to_free, cache); +} +#else +static void __per_cpu_remove_cache(struct qlist_head *q, void *arg) +{ + struct kmem_cache *cache = arg; + unsigned long flags; + struct cpu_shrink_qlist *sq; + + sq = this_cpu_ptr(&shrink_qlist); + raw_spin_lock_irqsave(&sq->lock, flags); + qlist_move_cache(q, &sq->qlist, cache); + raw_spin_unlock_irqrestore(&sq->lock, flags); +} +#endif + +static void per_cpu_remove_cache(void *arg) +{ + struct qlist_head *q; + + q = this_cpu_ptr(&cpu_quarantine); + /* + * Ensure the ordering between the writing to q->offline and + * per_cpu_remove_cache. Prevent cpu_quarantine from being corrupted + * by interrupt. + */ + if (READ_ONCE(q->offline)) + return; + __per_cpu_remove_cache(q, arg); +} + +/* Free all quarantined objects belonging to cache. */ +void kasan_quarantine_remove_cache(struct kmem_cache *cache) +{ + unsigned long flags, i; + struct qlist_head to_free = QLIST_INIT; + + /* + * Must be careful to not miss any objects that are being moved from + * per-cpu list to the global quarantine in kasan_quarantine_put(), + * nor objects being freed in kasan_quarantine_reduce(). on_each_cpu() + * achieves the first goal, while synchronize_srcu() achieves the + * second. 
+ */ + on_each_cpu(per_cpu_remove_cache, cache, 1); + +#ifdef CONFIG_PREEMPT_RT + { + int cpu; + struct cpu_shrink_qlist *sq; + + for_each_online_cpu(cpu) { + sq = per_cpu_ptr(&shrink_qlist, cpu); + raw_spin_lock_irqsave(&sq->lock, flags); + qlist_move_cache(&sq->qlist, &to_free, cache); + raw_spin_unlock_irqrestore(&sq->lock, flags); + } + qlist_free_all(&to_free, cache); + } +#endif + + raw_spin_lock_irqsave(&quarantine_lock, flags); + for (i = 0; i < QUARANTINE_BATCHES; i++) { + if (qlist_empty(&global_quarantine[i])) + continue; + qlist_move_cache(&global_quarantine[i], &to_free, cache); + /* Scanning whole quarantine can take a while. */ + raw_spin_unlock_irqrestore(&quarantine_lock, flags); + cond_resched(); + raw_spin_lock_irqsave(&quarantine_lock, flags); + } + raw_spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, cache); + + synchronize_srcu(&remove_cache_srcu); +} + +static int kasan_cpu_online(unsigned int cpu) +{ + this_cpu_ptr(&cpu_quarantine)->offline = false; + return 0; +} + +static int kasan_cpu_offline(unsigned int cpu) +{ + struct qlist_head *q; + + q = this_cpu_ptr(&cpu_quarantine); + /* Ensure the ordering between the writing to q->offline and + * qlist_free_all. Otherwise, cpu_quarantine may be corrupted + * by interrupt. + */ + WRITE_ONCE(q->offline, true); + barrier(); + qlist_free_all(q, NULL); + return 0; +} + +static int __init kasan_cpu_quarantine_init(void) +{ + int ret = 0; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/kasan:online", + kasan_cpu_online, kasan_cpu_offline); + if (ret < 0) + pr_err("kasan cpu quarantine register failed [%d]\n", ret); + return ret; +} +late_initcall(kasan_cpu_quarantine_init); diff --git a/mm/kasan/report.c b/mm/kasan/report.c new file mode 100644 index 000000000..5d9ae80df --- /dev/null +++ b/mm/kasan/report.c @@ -0,0 +1,564 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains common KASAN error reporting code. 
+ * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "kasan.h" +#include "../slab.h" + /* Bit field holding the KASAN_BIT_* flags defined below. */ +static unsigned long kasan_flags; + +#define KASAN_BIT_REPORTED 0 +#define KASAN_BIT_MULTI_SHOT 1 + /* Parsed value of the kasan.fault= boot parameter. */ +enum kasan_arg_fault { + KASAN_ARG_FAULT_DEFAULT, + KASAN_ARG_FAULT_REPORT, + KASAN_ARG_FAULT_PANIC, +}; + +static enum kasan_arg_fault kasan_arg_fault __ro_after_init = KASAN_ARG_FAULT_DEFAULT; + +/* Parse kasan.fault=report/panic; a missing or unrecognized value yields -EINVAL. */ +static int __init early_kasan_fault(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "report")) + kasan_arg_fault = KASAN_ARG_FAULT_REPORT; + else if (!strcmp(arg, "panic")) + kasan_arg_fault = KASAN_ARG_FAULT_PANIC; + else + return -EINVAL; + + return 0; +} +early_param("kasan.fault", early_kasan_fault); + /* Handler for the kasan_multi_shot boot parameter: set KASAN_BIT_MULTI_SHOT so that more than one report can be printed. */ +static int __init kasan_set_multi_shot(char *str) +{ + set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); + return 1; +} +__setup("kasan_multi_shot", kasan_set_multi_shot); + +/* + * Used to suppress reports within kasan_disable/enable_current() critical + * sections, which are used for marking accesses to slab metadata. + * Only the generic and software tag-based modes consult current->kasan_depth; + * in other configurations this always returns false. + */ +static bool report_suppressed(void) +{ +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + if (current->kasan_depth) + return true; +#endif + return false; +} + +/* + * Used to avoid reporting more than one KASAN bug unless kasan_multi_shot + * is enabled. Note that KASAN tests effectively enable kasan_multi_shot + * for their duration. 
+ */ +static bool report_enabled(void) +{ + if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + return true; + return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); +} + +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) + +bool kasan_save_enable_multi_shot(void) +{ + return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); + +void kasan_restore_multi_shot(bool enabled) +{ + if (!enabled) + clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); + +#endif + +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) +static void update_kunit_status(bool sync) +{ + struct kunit *test; + struct kunit_resource *resource; + struct kunit_kasan_status *status; + + test = current->kunit_test; + if (!test) + return; + + resource = kunit_find_named_resource(test, "kasan_status"); + if (!resource) { + kunit_set_failure(test); + return; + } + + status = (struct kunit_kasan_status *)resource->data; + WRITE_ONCE(status->report_found, true); + WRITE_ONCE(status->sync_fault, sync); + + kunit_put_resource(resource); +} +#else +static void update_kunit_status(bool sync) { } +#endif + +static DEFINE_SPINLOCK(report_lock); + +static void start_report(unsigned long *flags, bool sync) +{ + /* Respect the /proc/sys/kernel/traceoff_on_warning interface. */ + disable_trace_on_warning(); + /* Update status of the currently running KASAN test. */ + update_kunit_status(sync); + /* Do not allow LOCKDEP mangling KASAN reports. */ + lockdep_off(); + /* Make sure we don't end up in loop. 
*/ + kasan_disable_current(); + spin_lock_irqsave(&report_lock, *flags); + pr_err("==================================================================\n"); +} + +static void end_report(unsigned long *flags, void *addr) +{ + if (addr) + trace_error_report_end(ERROR_DETECTOR_KASAN, + (unsigned long)addr); + pr_err("==================================================================\n"); + spin_unlock_irqrestore(&report_lock, *flags); + if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + check_panic_on_warn("KASAN"); + if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) + panic("kasan.fault=panic set ...\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + lockdep_on(); + kasan_enable_current(); +} + +static void print_error_description(struct kasan_report_info *info) +{ + pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip); + + if (info->type != KASAN_REPORT_ACCESS) { + pr_err("Free of addr %px by task %s/%d\n", + info->access_addr, current->comm, task_pid_nr(current)); + return; + } + + if (info->access_size) + pr_err("%s of size %zu at addr %px by task %s/%d\n", + info->is_write ? "Write" : "Read", info->access_size, + info->access_addr, current->comm, task_pid_nr(current)); + else + pr_err("%s at addr %px by task %s/%d\n", + info->is_write ? 
"Write" : "Read", + info->access_addr, current->comm, task_pid_nr(current)); +} + +static void print_track(struct kasan_track *track, const char *prefix) +{ + pr_err("%s by task %u:\n", prefix, track->pid); + if (track->stack) + stack_depot_print(track->stack); + else + pr_err("(stack is not available)\n"); +} + +static inline struct page *addr_to_page(const void *addr) +{ + if (virt_addr_valid(addr)) + return virt_to_head_page(addr); + return NULL; +} + +static void describe_object_addr(const void *addr, struct kmem_cache *cache, + void *object) +{ + unsigned long access_addr = (unsigned long)addr; + unsigned long object_addr = (unsigned long)object; + const char *rel_type; + int rel_bytes; + + pr_err("The buggy address belongs to the object at %px\n" + " which belongs to the cache %s of size %d\n", + object, cache->name, cache->object_size); + + if (access_addr < object_addr) { + rel_type = "to the left"; + rel_bytes = object_addr - access_addr; + } else if (access_addr >= object_addr + cache->object_size) { + rel_type = "to the right"; + rel_bytes = access_addr - (object_addr + cache->object_size); + } else { + rel_type = "inside"; + rel_bytes = access_addr - object_addr; + } + + pr_err("The buggy address is located %d bytes %s of\n" + " %d-byte region [%px, %px)\n", + rel_bytes, rel_type, cache->object_size, (void *)object_addr, + (void *)(object_addr + cache->object_size)); +} + +static void describe_object_stacks(struct kasan_report_info *info) +{ + if (info->alloc_track.stack) { + print_track(&info->alloc_track, "Allocated"); + pr_err("\n"); + } + + if (info->free_track.stack) { + print_track(&info->free_track, "Freed"); + pr_err("\n"); + } + + kasan_print_aux_stacks(info->cache, info->object); +} + +static void describe_object(const void *addr, struct kasan_report_info *info) +{ + if (kasan_stack_collection_enabled()) + describe_object_stacks(info); + describe_object_addr(addr, info->cache, info->object); +} + +static inline bool kernel_or_module_addr(const 
void *addr) +{ + if (is_kernel((unsigned long)addr)) + return true; + if (is_module_address((unsigned long)addr)) + return true; + return false; +} + +static inline bool init_task_stack_addr(const void *addr) +{ + return addr >= (void *)&init_thread_union.stack && + (addr <= (void *)&init_thread_union.stack + + sizeof(init_thread_union.stack)); +} + +static void print_address_description(void *addr, u8 tag, + struct kasan_report_info *info) +{ + struct page *page = addr_to_page(addr); + + dump_stack_lvl(KERN_ERR); + pr_err("\n"); + + if (info->cache && info->object) { + describe_object(addr, info); + pr_err("\n"); + } + + if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { + pr_err("The buggy address belongs to the variable:\n"); + pr_err(" %pS\n", addr); + pr_err("\n"); + } + + if (object_is_on_stack(addr)) { + /* + * Currently, KASAN supports printing frame information only + * for accesses to the task's own stack. + */ + kasan_print_address_stack_frame(addr); + pr_err("\n"); + } + + if (is_vmalloc_addr(addr)) { + struct vm_struct *va = find_vm_area(addr); + + if (va) { + pr_err("The buggy address belongs to the virtual mapping at\n" + " [%px, %px) created by:\n" + " %pS\n", + va->addr, va->addr + va->size, va->caller); + pr_err("\n"); + + page = vmalloc_to_page(addr); + } + } + + if (page) { + pr_err("The buggy address belongs to the physical page:\n"); + dump_page(page, "kasan: bad access detected"); + pr_err("\n"); + } +} + +static bool meta_row_is_guilty(const void *row, const void *addr) +{ + return (row <= addr) && (addr < row + META_MEM_BYTES_PER_ROW); +} + +static int meta_pointer_offset(const void *row, const void *addr) +{ + /* + * Memory state around the buggy address: + * ff00ff00ff00ff00: 00 00 00 05 fe fe fe fe fe fe fe fe fe fe fe fe + * ... + * + * The length of ">ff00ff00ff00ff00: " is + * 3 + (BITS_PER_LONG / 8) * 2 chars. + * The length of each granule metadata is 2 bytes + * plus 1 byte for space. 
+ */ + return 3 + (BITS_PER_LONG / 8) * 2 + + (addr - row) / KASAN_GRANULE_SIZE * 3 + 1; +} + +static void print_memory_metadata(const void *addr) +{ + int i; + void *row; + + row = (void *)round_down((unsigned long)addr, META_MEM_BYTES_PER_ROW) + - META_ROWS_AROUND_ADDR * META_MEM_BYTES_PER_ROW; + + pr_err("Memory state around the buggy address:\n"); + + for (i = -META_ROWS_AROUND_ADDR; i <= META_ROWS_AROUND_ADDR; i++) { + char buffer[4 + (BITS_PER_LONG / 8) * 2]; + char metadata[META_BYTES_PER_ROW]; + + snprintf(buffer, sizeof(buffer), + (i == 0) ? ">%px: " : " %px: ", row); + + /* + * We should not pass a shadow pointer to generic + * function, because generic functions may try to + * access kasan mapping for the passed address. + */ + kasan_metadata_fetch_row(&metadata[0], row); + + print_hex_dump(KERN_ERR, buffer, + DUMP_PREFIX_NONE, META_BYTES_PER_ROW, 1, + metadata, META_BYTES_PER_ROW, 0); + + if (meta_row_is_guilty(row, addr)) + pr_err("%*c\n", meta_pointer_offset(row, addr), '^'); + + row += META_MEM_BYTES_PER_ROW; + } +} + +static void print_report(struct kasan_report_info *info) +{ + void *addr = kasan_reset_tag(info->access_addr); + u8 tag = get_tag(info->access_addr); + + print_error_description(info); + if (addr_has_metadata(addr)) + kasan_print_tags(tag, info->first_bad_addr); + pr_err("\n"); + + if (addr_has_metadata(addr)) { + print_address_description(addr, tag, info); + print_memory_metadata(info->first_bad_addr); + } else { + dump_stack_lvl(KERN_ERR); + } +} + +static void complete_report_info(struct kasan_report_info *info) +{ + void *addr = kasan_reset_tag(info->access_addr); + struct slab *slab; + + if (info->type == KASAN_REPORT_ACCESS) + info->first_bad_addr = kasan_find_first_bad_addr( + info->access_addr, info->access_size); + else + info->first_bad_addr = addr; + + slab = kasan_addr_to_slab(addr); + if (slab) { + info->cache = slab->slab_cache; + info->object = nearest_obj(info->cache, slab, addr); + } else + info->cache = info->object = 
NULL; + + switch (info->type) { + case KASAN_REPORT_INVALID_FREE: + info->bug_type = "invalid-free"; + break; + case KASAN_REPORT_DOUBLE_FREE: + info->bug_type = "double-free"; + break; + default: + /* bug_type filled in by kasan_complete_mode_report_info. */ + break; + } + + /* Fill in mode-specific report info fields. */ + kasan_complete_mode_report_info(info); +} + +void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) +{ + unsigned long flags; + struct kasan_report_info info; + + /* + * Do not check report_suppressed(), as an invalid-free cannot be + * caused by accessing slab metadata and thus should not be + * suppressed by kasan_disable/enable_current() critical sections. + */ + if (unlikely(!report_enabled())) + return; + + start_report(&flags, true); + + memset(&info, 0, sizeof(info)); + info.type = type; + info.access_addr = ptr; + info.access_size = 0; + info.is_write = false; + info.ip = ip; + + complete_report_info(&info); + + print_report(&info); + + end_report(&flags, ptr); +} + +/* + * kasan_report() is the only reporting function that uses + * user_access_save/restore(): kasan_report_invalid_free() cannot be called + * from a UACCESS region, and kasan_report_async() is not used on x86. 
+ */ +bool kasan_report(unsigned long addr, size_t size, bool is_write, + unsigned long ip) +{ + bool ret = true; + void *ptr = (void *)addr; + unsigned long ua_flags = user_access_save(); + unsigned long irq_flags; + struct kasan_report_info info; + /* Return false without printing when reports are suppressed or reporting is no longer enabled. */ + if (unlikely(report_suppressed()) || unlikely(!report_enabled())) { + ret = false; + goto out; + } + + start_report(&irq_flags, true); + /* Describe the faulting access in a zeroed report descriptor. */ + memset(&info, 0, sizeof(info)); + info.type = KASAN_REPORT_ACCESS; + info.access_addr = ptr; + info.access_size = size; + info.is_write = is_write; + info.ip = ip; + /* Fill in the derived fields, then print the report. */ + complete_report_info(&info); + + print_report(&info); + + end_report(&irq_flags, ptr); + /* Both the early exit and the normal path restore the saved user access state. */ +out: + user_access_restore(ua_flags); + + return ret; +} + /* Report an asynchronous fault: only a fixed header and a stack dump are printed, as no access details are available. */ +#ifdef CONFIG_KASAN_HW_TAGS +void kasan_report_async(void) +{ + unsigned long flags; + + /* + * Do not check report_suppressed(), as kasan_disable/enable_current() + * critical sections do not affect Hardware Tag-Based KASAN. + */ + if (unlikely(!report_enabled())) + return; + + start_report(&flags, false); + pr_err("BUG: KASAN: invalid-access\n"); + pr_err("Asynchronous fault: no details available\n"); + pr_err("\n"); + dump_stack_lvl(KERN_ERR); + end_report(&flags, NULL); +} +#endif /* CONFIG_KASAN_HW_TAGS */ + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) +/* + * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high + * canonical half of the address space) cause out-of-bounds shadow memory reads + * before the actual access. For addresses in the low canonical half of the + * address space, as well as most non-canonical addresses, that out-of-bounds + * shadow memory access lands in the non-canonical part of the address space. + * Help the user figure out what the original bogus pointer was. 
+ */ +void kasan_non_canonical_hook(unsigned long addr) +{ + unsigned long orig_addr; + const char *bug_type; + /* Addresses below KASAN_SHADOW_OFFSET are not decoded: nothing is printed for them. */ + if (addr < KASAN_SHADOW_OFFSET) + return; + /* Undo the offset and scale to recover the address whose shadow was presumably being accessed. */ + orig_addr = (addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT; + /* + * For faults near the shadow address for NULL, we can be fairly certain + * that this is a KASAN shadow memory access. + * For faults that correspond to shadow for low canonical addresses, we + * can still be pretty sure - that shadow region is a fairly narrow + * chunk of the non-canonical address space. + * But faults that look like shadow for non-canonical addresses are a + * really large chunk of the address space. In that case, we still + * print the decoded address, but make it clear that this is not + * necessarily what's actually going on. + */ + if (orig_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if (orig_addr < TASK_SIZE) + bug_type = "probably user-memory-access"; + else + bug_type = "maybe wild-memory-access"; + pr_alert("KASAN: %s in range [0x%016lx-0x%016lx]\n", bug_type, + orig_addr, orig_addr + KASAN_GRANULE_SIZE - 1); +} +#endif diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c new file mode 100644 index 000000000..043c94b04 --- /dev/null +++ b/mm/kasan/report_generic.c @@ -0,0 +1,369 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains generic KASAN specific error reporting code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
+ * Author: Andrey Ryabinin + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kasan.h" +#include "../slab.h" + /* Step through [addr, addr + size) one granule at a time and return the first address whose shadow byte is non-zero; if none is, the returned pointer is at or past addr + size. */ +void *kasan_find_first_bad_addr(void *addr, size_t size) +{ + void *p = addr; + /* Addresses for which addr_has_metadata() is false are returned unchanged. */ + if (!addr_has_metadata(p)) + return p; + + while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p))) + p += KASAN_GRANULE_SIZE; + + return p; +} + /* Translate the shadow byte found for info->first_bad_addr into a bug type name. */ +static const char *get_shadow_bug_type(struct kasan_report_info *info) +{ + const char *bug_type = "unknown-crash"; + u8 *shadow_addr; + + shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); + + /* + * If shadow byte value is in (0, KASAN_GRANULE_SIZE) we can look + * at the next shadow byte to determine the type of the bad access. + */ + if (*shadow_addr > 0 && *shadow_addr <= KASAN_GRANULE_SIZE - 1) + shadow_addr++; + + switch (*shadow_addr) { + case 0 ... KASAN_GRANULE_SIZE - 1: + /* + * In theory it's still possible to see these shadow values + * due to a data race in the kernel code. 
+ */ + bug_type = "out-of-bounds"; + break; + case KASAN_PAGE_REDZONE: + case KASAN_SLAB_REDZONE: + bug_type = "slab-out-of-bounds"; + break; + case KASAN_GLOBAL_REDZONE: + bug_type = "global-out-of-bounds"; + break; + case KASAN_STACK_LEFT: + case KASAN_STACK_MID: + case KASAN_STACK_RIGHT: + case KASAN_STACK_PARTIAL: + bug_type = "stack-out-of-bounds"; + break; + case KASAN_PAGE_FREE: + case KASAN_SLAB_FREE: + case KASAN_SLAB_FREETRACK: + bug_type = "use-after-free"; + break; + case KASAN_ALLOCA_LEFT: + case KASAN_ALLOCA_RIGHT: + bug_type = "alloca-out-of-bounds"; + break; + case KASAN_VMALLOC_INVALID: + bug_type = "vmalloc-out-of-bounds"; + break; + } + + return bug_type; +} + +static const char *get_wild_bug_type(struct kasan_report_info *info) +{ + const char *bug_type = "unknown-crash"; + + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + + return bug_type; +} + +static const char *get_bug_type(struct kasan_report_info *info) +{ + /* + * If access_size is a negative number, then it has reason to be + * defined as out-of-bounds bug type. + * + * Casting negative numbers to size_t would indeed turn up as + * a large size_t and its value will be larger than ULONG_MAX/2, + * so that this can qualify as out-of-bounds. 
+ */ + if (info->access_addr + info->access_size < info->access_addr) + return "out-of-bounds"; + + if (addr_has_metadata(info->access_addr)) + return get_shadow_bug_type(info); + return get_wild_bug_type(info); +} + +void kasan_complete_mode_report_info(struct kasan_report_info *info) +{ + struct kasan_alloc_meta *alloc_meta; + struct kasan_free_meta *free_meta; + + if (!info->bug_type) + info->bug_type = get_bug_type(info); + + if (!info->cache || !info->object) + return; + + alloc_meta = kasan_get_alloc_meta(info->cache, info->object); + if (alloc_meta) + memcpy(&info->alloc_track, &alloc_meta->alloc_track, + sizeof(info->alloc_track)); + + if (*(u8 *)kasan_mem_to_shadow(info->object) == KASAN_SLAB_FREETRACK) { + /* Free meta must be present with KASAN_SLAB_FREETRACK. */ + free_meta = kasan_get_free_meta(info->cache, info->object); + memcpy(&info->free_track, &free_meta->free_track, + sizeof(info->free_track)); + } +} + +void kasan_metadata_fetch_row(char *buffer, void *row) +{ + memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); +} + +void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return; + + if (alloc_meta->aux_stack[0]) { + pr_err("Last potentially related work creation:\n"); + stack_depot_print(alloc_meta->aux_stack[0]); + pr_err("\n"); + } + if (alloc_meta->aux_stack[1]) { + pr_err("Second to last potentially related work creation:\n"); + stack_depot_print(alloc_meta->aux_stack[1]); + pr_err("\n"); + } +} + +#ifdef CONFIG_KASAN_STACK +static bool __must_check tokenize_frame_descr(const char **frame_descr, + char *token, size_t max_tok_len, + unsigned long *value) +{ + const char *sep = strchr(*frame_descr, ' '); + + if (sep == NULL) + sep = *frame_descr + strlen(*frame_descr); + + if (token != NULL) { + const size_t tok_len = sep - *frame_descr; + + if (tok_len + 1 > max_tok_len) { + pr_err("KASAN 
internal error: frame description too long: %s\n", + *frame_descr); + return false; + } + + /* Copy token (+ 1 byte for '\0'). */ + strscpy(token, *frame_descr, tok_len + 1); + } + + /* Advance frame_descr past separator. */ + *frame_descr = sep + 1; + + if (value != NULL && kstrtoul(token, 10, value)) { + pr_err("KASAN internal error: not a valid number: %s\n", token); + return false; + } + + return true; +} + +static void print_decoded_frame_descr(const char *frame_descr) +{ + /* + * We need to parse the following string: + * "n alloc_1 alloc_2 ... alloc_n" + * where alloc_i looks like + * "offset size len name" + * or "offset size len name:line". + */ + + char token[64]; + unsigned long num_objects; + + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + &num_objects)) + return; + + pr_err("\n"); + pr_err("This frame has %lu %s:\n", num_objects, + num_objects == 1 ? "object" : "objects"); + + while (num_objects--) { + unsigned long offset; + unsigned long size; + + /* access offset */ + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + &offset)) + return; + /* access size */ + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + &size)) + return; + /* name length (unused) */ + if (!tokenize_frame_descr(&frame_descr, NULL, 0, NULL)) + return; + /* object name */ + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + NULL)) + return; + + /* Strip line number; without filename it's not very helpful. */ + strreplace(token, ':', '\0'); + + /* Finally, print object information. */ + pr_err(" [%lu, %lu) '%s'", offset, offset + size, token); + } +} + +/* Returns true only if the address is on the current task's stack. 
*/ +static bool __must_check get_address_stack_frame_info(const void *addr, + unsigned long *offset, + const char **frame_descr, + const void **frame_pc) +{ + unsigned long aligned_addr; + unsigned long mem_ptr; + const u8 *shadow_bottom; + const u8 *shadow_ptr; + const unsigned long *frame; + + BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP)); + + aligned_addr = round_down((unsigned long)addr, sizeof(long)); + mem_ptr = round_down(aligned_addr, KASAN_GRANULE_SIZE); + shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr); + shadow_bottom = kasan_mem_to_shadow(end_of_stack(current)); + + while (shadow_ptr >= shadow_bottom && *shadow_ptr != KASAN_STACK_LEFT) { + shadow_ptr--; + mem_ptr -= KASAN_GRANULE_SIZE; + } + + while (shadow_ptr >= shadow_bottom && *shadow_ptr == KASAN_STACK_LEFT) { + shadow_ptr--; + mem_ptr -= KASAN_GRANULE_SIZE; + } + + if (shadow_ptr < shadow_bottom) + return false; + + frame = (const unsigned long *)(mem_ptr + KASAN_GRANULE_SIZE); + if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) { + pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n", + frame[0]); + return false; + } + + *offset = (unsigned long)addr - (unsigned long)frame; + *frame_descr = (const char *)frame[1]; + *frame_pc = (void *)frame[2]; + + return true; +} + +void kasan_print_address_stack_frame(const void *addr) +{ + unsigned long offset; + const char *frame_descr; + const void *frame_pc; + + if (WARN_ON(!object_is_on_stack(addr))) + return; + + pr_err("The buggy address belongs to stack of task %s/%d\n", + current->comm, task_pid_nr(current)); + + if (!get_address_stack_frame_info(addr, &offset, &frame_descr, + &frame_pc)) + return; + + pr_err(" and is located at offset %lu in frame:\n", offset); + pr_err(" %pS\n", frame_pc); + + if (!frame_descr) + return; + + print_decoded_frame_descr(frame_descr); +} +#endif /* CONFIG_KASAN_STACK */ + +#define DEFINE_ASAN_REPORT_LOAD(size) \ +void __asan_report_load##size##_noabort(unsigned long addr) \ +{ \ + 
kasan_report(addr, size, false, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_load##size##_noabort) + +#define DEFINE_ASAN_REPORT_STORE(size) \ +void __asan_report_store##size##_noabort(unsigned long addr) \ +{ \ + kasan_report(addr, size, true, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_store##size##_noabort) + +DEFINE_ASAN_REPORT_LOAD(1); +DEFINE_ASAN_REPORT_LOAD(2); +DEFINE_ASAN_REPORT_LOAD(4); +DEFINE_ASAN_REPORT_LOAD(8); +DEFINE_ASAN_REPORT_LOAD(16); +DEFINE_ASAN_REPORT_STORE(1); +DEFINE_ASAN_REPORT_STORE(2); +DEFINE_ASAN_REPORT_STORE(4); +DEFINE_ASAN_REPORT_STORE(8); +DEFINE_ASAN_REPORT_STORE(16); + +void __asan_report_load_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__asan_report_load_n_noabort); + +void __asan_report_store_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__asan_report_store_n_noabort); diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c new file mode 100644 index 000000000..f3d3be614 --- /dev/null +++ b/mm/kasan/report_hw_tags.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains hardware tag-based KASAN specific error reporting code. + * + * Copyright (c) 2020 Google, Inc. + * Author: Andrey Konovalov + */ + +#include +#include +#include +#include +#include +#include + +#include "kasan.h" + +void *kasan_find_first_bad_addr(void *addr, size_t size) +{ + /* Return the same value regardless of whether addr_has_metadata(). 
*/ + return kasan_reset_tag(addr); +} + /* Fill buffer with the memory tags of META_BYTES_PER_ROW consecutive granules starting at row. */ +void kasan_metadata_fetch_row(char *buffer, void *row) +{ + int i; + + for (i = 0; i < META_BYTES_PER_ROW; i++) + buffer[i] = hw_get_mem_tag(row + i * KASAN_GRANULE_SIZE); +} + /* Print the pointer tag next to the memory tag read back for addr. */ +void kasan_print_tags(u8 addr_tag, const void *addr) +{ + u8 memory_tag = hw_get_mem_tag((void *)addr); + + pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", + addr_tag, memory_tag); +} diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c new file mode 100644 index 000000000..7a2639729 --- /dev/null +++ b/mm/kasan/report_sw_tags.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains software tag-based KASAN specific error reporting code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kasan.h" +#include "../slab.h" + /* Step through [addr, addr + size) one granule at a time and return the first untagged address whose shadow byte differs from the pointer tag; if none does, the result is at or past the end. */ +void *kasan_find_first_bad_addr(void *addr, size_t size) +{ + u8 tag = get_tag(addr); + void *p = kasan_reset_tag(addr); + void *end = p + size; + /* Addresses for which addr_has_metadata() is false are returned as is (tag already reset). */ + if (!addr_has_metadata(p)) + return p; + + while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p)) + p += KASAN_GRANULE_SIZE; + + return p; +} + /* Copy META_BYTES_PER_ROW shadow bytes, starting at the shadow of row, into buffer. */ +void kasan_metadata_fetch_row(char *buffer, void *row) +{ + memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); +} + /* Print the pointer tag next to the shadow byte of addr. */ +void kasan_print_tags(u8 addr_tag, const void *addr) +{ + u8 *shadow = (u8 *)kasan_mem_to_shadow(addr); + + pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow); +} + /* This variant only names the task owning the stack; it prints no frame details. */ +#ifdef CONFIG_KASAN_STACK +void kasan_print_address_stack_frame(const void *addr) +{ + if (WARN_ON(!object_is_on_stack(addr))) + return; + + pr_err("The buggy address belongs to stack of task %s/%d\n", + current->comm, task_pid_nr(current)); +} +#endif diff --git 
a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c new file mode 100644 index 000000000..ecede06ef --- /dev/null +++ b/mm/kasan/report_tags.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Copyright (c) 2020 Google, Inc. + */ + +#include + +#include "kasan.h" + +extern struct kasan_stack_ring stack_ring; + +static const char *get_common_bug_type(struct kasan_report_info *info) +{ + /* + * If access_size is a negative number, then it has reason to be + * defined as out-of-bounds bug type. + * + * Casting negative numbers to size_t would indeed turn up as + * a large size_t and its value will be larger than ULONG_MAX/2, + * so that this can qualify as out-of-bounds. + */ + if (info->access_addr + info->access_size < info->access_addr) + return "out-of-bounds"; + + return "invalid-access"; +} + +void kasan_complete_mode_report_info(struct kasan_report_info *info) +{ + unsigned long flags; + u64 pos; + struct kasan_stack_ring_entry *entry; + void *ptr; + u32 pid; + depot_stack_handle_t stack; + bool is_free; + bool alloc_found = false, free_found = false; + + if ((!info->cache || !info->object) && !info->bug_type) { + info->bug_type = get_common_bug_type(info); + return; + } + + write_lock_irqsave(&stack_ring.lock, flags); + + pos = atomic64_read(&stack_ring.pos); + + /* + * The loop below tries to find stack ring entries relevant to the + * buggy object. This is a best-effort process. + * + * First, another object with the same tag can be allocated in place of + * the buggy object. Also, since the number of entries is limited, the + * entries relevant to the buggy object can be overwritten. + */ + + for (u64 i = pos - 1; i != pos - 1 - stack_ring.size; i--) { + if (alloc_found && free_found) + break; + + entry = &stack_ring.entries[i % stack_ring.size]; + + /* Paired with smp_store_release() in save_stack_info(). 
*/ + ptr = (void *)smp_load_acquire(&entry->ptr); + + if (kasan_reset_tag(ptr) != info->object || + get_tag(ptr) != get_tag(info->access_addr)) + continue; + + pid = READ_ONCE(entry->pid); + stack = READ_ONCE(entry->stack); + is_free = READ_ONCE(entry->is_free); + + if (is_free) { + /* + * Second free of the same object. + * Give up on trying to find the alloc entry. + */ + if (free_found) + break; + + info->free_track.pid = pid; + info->free_track.stack = stack; + free_found = true; + + /* + * If a free entry is found first, the bug is likely + * a use-after-free. + */ + if (!info->bug_type) + info->bug_type = "use-after-free"; + } else { + /* Second alloc of the same object. Give up. */ + if (alloc_found) + break; + + info->alloc_track.pid = pid; + info->alloc_track.stack = stack; + alloc_found = true; + + /* + * If an alloc entry is found first, the bug is likely + * an out-of-bounds. + */ + if (!info->bug_type) + info->bug_type = "slab-out-of-bounds"; + } + } + + write_unlock_irqrestore(&stack_ring.lock, flags); + + /* Assign the common bug type if no entries were found. */ + if (!info->bug_type) + info->bug_type = get_common_bug_type(info); +} diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c new file mode 100644 index 000000000..ecb7acb38 --- /dev/null +++ b/mm/kasan/shadow.c @@ -0,0 +1,598 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains KASAN runtime code that manages shadow memory for + * generic and software tag-based KASAN modes. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
+ * Author: Andrey Ryabinin + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "kasan.h" + +bool __kasan_check_read(const volatile void *p, unsigned int size) +{ + return kasan_check_range((unsigned long)p, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__kasan_check_read); + +bool __kasan_check_write(const volatile void *p, unsigned int size) +{ + return kasan_check_range((unsigned long)p, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__kasan_check_write); + +#undef memset +void *memset(void *addr, int c, size_t len) +{ + if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_)) + return NULL; + + return __memset(addr, c, len); +} + +#ifdef __HAVE_ARCH_MEMMOVE +#undef memmove +void *memmove(void *dest, const void *src, size_t len) +{ + if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || + !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + return NULL; + + return __memmove(dest, src, len); +} +#endif + +#undef memcpy +void *memcpy(void *dest, const void *src, size_t len) +{ + if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || + !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) + return NULL; + + return __memcpy(dest, src, len); +} + +void kasan_poison(const void *addr, size_t size, u8 value, bool init) +{ + void *shadow_start, *shadow_end; + + if (!kasan_arch_is_ready()) + return; + + /* + * Perform shadow offset calculation based on untagged address, as + * some of the callers (e.g. kasan_poison_object_data) pass tagged + * addresses to this function. + */ + addr = kasan_reset_tag(addr); + + /* Skip KFENCE memory if called explicitly outside of sl*b. 
*/ + if (is_kfence_address(addr)) + return; + + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + if (WARN_ON(size & KASAN_GRANULE_MASK)) + return; + + shadow_start = kasan_mem_to_shadow(addr); + shadow_end = kasan_mem_to_shadow(addr + size); + + __memset(shadow_start, value, shadow_end - shadow_start); +} +EXPORT_SYMBOL(kasan_poison); + +#ifdef CONFIG_KASAN_GENERIC +void kasan_poison_last_granule(const void *addr, size_t size) +{ + if (!kasan_arch_is_ready()) + return; + + if (size & KASAN_GRANULE_MASK) { + u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size); + *shadow = size & KASAN_GRANULE_MASK; + } +} +#endif + +void kasan_unpoison(const void *addr, size_t size, bool init) +{ + u8 tag = get_tag(addr); + + /* + * Perform shadow offset calculation based on untagged address, as + * some of the callers (e.g. kasan_unpoison_object_data) pass tagged + * addresses to this function. + */ + addr = kasan_reset_tag(addr); + + /* + * Skip KFENCE memory if called explicitly outside of sl*b. Also note + * that calls to ksize(), where size is not a multiple of machine-word + * size, would otherwise poison the invalid portion of the word. + */ + if (is_kfence_address(addr)) + return; + + if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK)) + return; + + /* Unpoison all granules that cover the object. */ + kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag, false); + + /* Partially poison the last granule for the generic mode. 
*/ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + kasan_poison_last_granule(addr, size); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static bool shadow_mapped(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (pgd_none(*pgd)) + return false; + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return false; + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return false; + + /* + * We can't use pud_large() or pud_huge(), the first one is + * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse + * pud_bad(), if pud is bad then it's bad because it's huge. + */ + if (pud_bad(*pud)) + return true; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return false; + + if (pmd_bad(*pmd)) + return true; + pte = pte_offset_kernel(pmd, addr); + return !pte_none(*pte); +} + +static int __meminit kasan_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct memory_notify *mem_data = data; + unsigned long nr_shadow_pages, start_kaddr, shadow_start; + unsigned long shadow_end, shadow_size; + + nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT; + start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn); + shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr); + shadow_size = nr_shadow_pages << PAGE_SHIFT; + shadow_end = shadow_start + shadow_size; + + if (WARN_ON(mem_data->nr_pages % KASAN_GRANULE_SIZE) || + WARN_ON(start_kaddr % KASAN_MEMORY_PER_SHADOW_PAGE)) + return NOTIFY_BAD; + + switch (action) { + case MEM_GOING_ONLINE: { + void *ret; + + /* + * If shadow is mapped already than it must have been mapped + * during the boot. This could happen if we onlining previously + * offlined memory. 
+ */ + if (shadow_mapped(shadow_start)) + return NOTIFY_OK; + + ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start, + shadow_end, GFP_KERNEL, + PAGE_KERNEL, VM_NO_GUARD, + pfn_to_nid(mem_data->start_pfn), + __builtin_return_address(0)); + if (!ret) + return NOTIFY_BAD; + + kmemleak_ignore(ret); + return NOTIFY_OK; + } + case MEM_CANCEL_ONLINE: + case MEM_OFFLINE: { + struct vm_struct *vm; + + /* + * shadow_start was either mapped during boot by kasan_init() + * or during memory online by __vmalloc_node_range(). + * In the latter case we can use vfree() to free shadow. + * Non-NULL result of the find_vm_area() will tell us if + * that was the second case. + * + * Currently it's not possible to free shadow mapped + * during boot by kasan_init(). It's because the code + * to do that hasn't been written yet. So we'll just + * leak the memory. + */ + vm = find_vm_area((void *)shadow_start); + if (vm) + vfree((void *)shadow_start); + } + } + + return NOTIFY_OK; +} + +static int __init kasan_memhotplug_init(void) +{ + hotplug_memory_notifier(kasan_mem_notifier, 0); + + return 0; +} + +core_initcall(kasan_memhotplug_init); +#endif + +#ifdef CONFIG_KASAN_VMALLOC + +void __init __weak kasan_populate_early_vm_area_shadow(void *start, + unsigned long size) +{ +} + +static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, + void *unused) +{ + unsigned long page; + pte_t pte; + + if (likely(!pte_none(*ptep))) + return 0; + + page = __get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE); + pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); + + spin_lock(&init_mm.page_table_lock); + if (likely(pte_none(*ptep))) { + set_pte_at(&init_mm, addr, ptep, pte); + page = 0; + } + spin_unlock(&init_mm.page_table_lock); + if (page) + free_page(page); + return 0; +} + +int kasan_populate_vmalloc(unsigned long addr, unsigned long size) +{ + unsigned long shadow_start, shadow_end; + int ret; + + if 
(!kasan_arch_is_ready()) + return 0; + + if (!is_vmalloc_or_module_addr((void *)addr)) + return 0; + + shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr); + shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size); + + /* + * User Mode Linux maps enough shadow memory for all of virtual memory + * at boot, so doesn't need to allocate more on vmalloc, just clear it. + * + * The remaining CONFIG_UML checks in this file exist for the same + * reason. + */ + if (IS_ENABLED(CONFIG_UML)) { + __memset((void *)shadow_start, KASAN_VMALLOC_INVALID, shadow_end - shadow_start); + return 0; + } + + shadow_start = PAGE_ALIGN_DOWN(shadow_start); + shadow_end = PAGE_ALIGN(shadow_end); + + ret = apply_to_page_range(&init_mm, shadow_start, + shadow_end - shadow_start, + kasan_populate_vmalloc_pte, NULL); + if (ret) + return ret; + + flush_cache_vmap(shadow_start, shadow_end); + + /* + * We need to be careful about inter-cpu effects here. Consider: + * + * CPU#0 CPU#1 + * WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ; + * p[99] = 1; + * + * With compiler instrumentation, that ends up looking like this: + * + * CPU#0 CPU#1 + * // vmalloc() allocates memory + * // let a = area->addr + * // we reach kasan_populate_vmalloc + * // and call kasan_unpoison: + * STORE shadow(a), unpoison_val + * ... + * STORE shadow(a+99), unpoison_val x = LOAD p + * // rest of vmalloc process + * STORE p, a LOAD shadow(x+99) + * + * If there is no barrier between the end of unpoisoning the shadow + * and the store of the result to p, the stores could be committed + * in a different order by CPU#0, and CPU#1 could erroneously observe + * poison in the shadow. + * + * We need some sort of barrier between the stores. + * + * In the vmalloc() case, this is provided by a smp_wmb() in + * clear_vm_uninitialized_flag(). 
In the per-cpu allocator and in + * get_vm_area() and friends, the caller gets shadow allocated but + * doesn't have any pages mapped into the virtual address space that + * has been reserved. Mapping those pages in will involve taking and + * releasing a page-table lock, which will provide the barrier. + */ + + return 0; +} + +static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, + void *unused) +{ + unsigned long page; + + page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT); + + spin_lock(&init_mm.page_table_lock); + + if (likely(!pte_none(*ptep))) { + pte_clear(&init_mm, addr, ptep); + free_page(page); + } + spin_unlock(&init_mm.page_table_lock); + + return 0; +} + +/* + * Release the backing for the vmalloc region [start, end), which + * lies within the free region [free_region_start, free_region_end). + * + * This can be run lazily, long after the region was freed. It runs + * under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap + * infrastructure. + * + * How does this work? + * ------------------- + * + * We have a region that is page aligned, labeled as A. + * That might not map onto the shadow in a way that is page-aligned: + * + * start end + * v v + * |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc + * -------- -------- -------- -------- -------- + * | | | | | + * | | | /-------/ | + * \-------\|/------/ |/---------------/ + * ||| || + * |??AAAAAA|AAAAAAAA|AA??????| < shadow + * (1) (2) (3) + * + * First we align the start upwards and the end downwards, so that the + * shadow of the region aligns with shadow page boundaries. In the + * example, this gives us the shadow page (2). This is the shadow entirely + * covered by this allocation. + * + * Then we have the tricky bits. We want to know if we can free the + * partially covered shadow pages - (1) and (3) in the example. For this, + * we are given the start and end of the free region that contains this + * allocation. 
Extending our previous example, we could have: + * + * free_region_start free_region_end + * | start end | + * v v v v + * |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc + * -------- -------- -------- -------- -------- + * | | | | | + * | | | /-------/ | + * \-------\|/------/ |/---------------/ + * ||| || + * |FFAAAAAA|AAAAAAAA|AAF?????| < shadow + * (1) (2) (3) + * + * Once again, we align the start of the free region up, and the end of + * the free region down so that the shadow is page aligned. So we can free + * page (1) - we know no allocation currently uses anything in that page, + * because all of it is in the vmalloc free region. But we cannot free + * page (3), because we can't be sure that the rest of it is unused. + * + * We only consider pages that contain part of the original region for + * freeing: we don't try to free other pages from the free region or we'd + * end up trying to free huge chunks of virtual address space. + * + * Concurrency + * ----------- + * + * How do we know that we're not freeing a page that is simultaneously + * being used for a fresh allocation in kasan_populate_vmalloc(_pte)? + * + * We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running + * at the same time. While we run under free_vmap_area_lock, the population + * code does not. + * + * free_vmap_area_lock instead operates to ensure that the larger range + * [free_region_start, free_region_end) is safe: because __alloc_vmap_area and + * the per-cpu region-finding algorithm both run under free_vmap_area_lock, + * no space identified as free will become used while we are running. This + * means that so long as we are careful with alignment and only free shadow + * pages entirely covered by the free region, we will not run in to any + * trouble - any simultaneous allocations will be for disjoint regions. 
+ */ +void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end) +{ + void *shadow_start, *shadow_end; + unsigned long region_start, region_end; + unsigned long size; + + if (!kasan_arch_is_ready()) + return; + + region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); + region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE); + + free_region_start = ALIGN(free_region_start, KASAN_MEMORY_PER_SHADOW_PAGE); + + if (start != region_start && + free_region_start < region_start) + region_start -= KASAN_MEMORY_PER_SHADOW_PAGE; + + free_region_end = ALIGN_DOWN(free_region_end, KASAN_MEMORY_PER_SHADOW_PAGE); + + if (end != region_end && + free_region_end > region_end) + region_end += KASAN_MEMORY_PER_SHADOW_PAGE; + + shadow_start = kasan_mem_to_shadow((void *)region_start); + shadow_end = kasan_mem_to_shadow((void *)region_end); + + if (shadow_end > shadow_start) { + size = shadow_end - shadow_start; + if (IS_ENABLED(CONFIG_UML)) { + __memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start); + return; + } + apply_to_existing_page_range(&init_mm, + (unsigned long)shadow_start, + size, kasan_depopulate_vmalloc_pte, + NULL); + flush_tlb_kernel_range((unsigned long)shadow_start, + (unsigned long)shadow_end); + } +} + +void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, + kasan_vmalloc_flags_t flags) +{ + /* + * Software KASAN modes unpoison both VM_ALLOC and non-VM_ALLOC + * mappings, so the KASAN_VMALLOC_VM_ALLOC flag is ignored. + * Software KASAN modes can't optimize zeroing memory by combining it + * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. + */ + + if (!kasan_arch_is_ready()) + return (void *)start; + + if (!is_vmalloc_or_module_addr(start)) + return (void *)start; + + /* + * Don't tag executable memory with the tag-based mode. + * The kernel doesn't tolerate having the PC register tagged. 
+ */ + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) && + !(flags & KASAN_VMALLOC_PROT_NORMAL)) + return (void *)start; + + start = set_tag(start, kasan_random_tag()); + kasan_unpoison(start, size, false); + return (void *)start; +} + +/* + * Poison the shadow for a vmalloc region. Called as part of the + * freeing process at the time the region is freed. + */ +void __kasan_poison_vmalloc(const void *start, unsigned long size) +{ + if (!kasan_arch_is_ready()) + return; + + if (!is_vmalloc_or_module_addr(start)) + return; + + size = round_up(size, KASAN_GRANULE_SIZE); + kasan_poison(start, size, KASAN_VMALLOC_INVALID, false); +} + +#else /* CONFIG_KASAN_VMALLOC */ + +int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask) +{ + void *ret; + size_t scaled_size; + size_t shadow_size; + unsigned long shadow_start; + + shadow_start = (unsigned long)kasan_mem_to_shadow(addr); + scaled_size = (size + KASAN_GRANULE_SIZE - 1) >> + KASAN_SHADOW_SCALE_SHIFT; + shadow_size = round_up(scaled_size, PAGE_SIZE); + + if (WARN_ON(!PAGE_ALIGNED(shadow_start))) + return -EINVAL; + + if (IS_ENABLED(CONFIG_UML)) { + __memset((void *)shadow_start, KASAN_SHADOW_INIT, shadow_size); + return 0; + } + + ret = __vmalloc_node_range(shadow_size, 1, shadow_start, + shadow_start + shadow_size, + GFP_KERNEL, + PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, + __builtin_return_address(0)); + + if (ret) { + struct vm_struct *vm = find_vm_area(addr); + __memset(ret, KASAN_SHADOW_INIT, shadow_size); + vm->flags |= VM_KASAN; + kmemleak_ignore(ret); + + if (vm->flags & VM_DEFER_KMEMLEAK) + kmemleak_vmalloc(vm, size, gfp_mask); + + return 0; + } + + return -ENOMEM; +} + +void kasan_free_module_shadow(const struct vm_struct *vm) +{ + if (IS_ENABLED(CONFIG_UML)) + return; + + if (vm->flags & VM_KASAN) + vfree(kasan_mem_to_shadow(vm->addr)); +} + +#endif diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c new file mode 100644 index 000000000..a3afaf2ad --- /dev/null +++ b/mm/kasan/sw_tags.c @@ -0,0 +1,178 @@ 
+// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains core software tag-based KASAN code. + * + * Copyright (c) 2018 Google, Inc. + * Author: Andrey Konovalov + */ + +#define pr_fmt(fmt) "kasan: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kasan.h" +#include "../slab.h" + +static DEFINE_PER_CPU(u32, prng_state); + +void __init kasan_init_sw_tags(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(prng_state, cpu) = (u32)get_cycles(); + + kasan_init_tags(); + + pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n", + kasan_stack_collection_enabled() ? "on" : "off"); +} + +/* + * If a preemption happens between this_cpu_read and this_cpu_write, the only + * side effect is that we'll give a few objects allocated in different contexts + * the same tag. Since tag-based KASAN is meant to be used as a probabilistic + * bug-detection debug feature, this doesn't have significant negative impact. + * + * Ideally the tags use strong randomness to prevent any attempts to predict + * them during explicit exploit attempts. But strong randomness is expensive, + * and we did an intentional trade-off to use a PRNG. This non-atomic RMW + * sequence has in fact a positive effect, since interrupts that randomly skew + * PRNG at unpredictable points do only good.
+ */ +u8 kasan_random_tag(void) +{ + u32 state = this_cpu_read(prng_state); + + state = 1664525 * state + 1013904223; + this_cpu_write(prng_state, state); + + return (u8)(state % (KASAN_TAG_MAX + 1)); +} + +bool kasan_check_range(unsigned long addr, size_t size, bool write, + unsigned long ret_ip) +{ + u8 tag; + u8 *shadow_first, *shadow_last, *shadow; + void *untagged_addr; + + if (unlikely(size == 0)) + return true; + + if (unlikely(addr + size < addr)) + return !kasan_report(addr, size, write, ret_ip); + + tag = get_tag((const void *)addr); + + /* + * Ignore accesses for pointers tagged with 0xff (native kernel + * pointer tag) to suppress false positives caused by kmap. + * + * Some kernel code was written to account for archs that don't keep + * high memory mapped all the time, but rather map and unmap particular + * pages when needed. Instead of storing a pointer to the kernel memory, + * this code saves the address of the page structure and offset within + * that page for later use. Those pages are then mapped and unmapped + * with kmap/kunmap when necessary and virt_to_page is used to get the + * virtual address of the page. For arm64 (that keeps the high memory + * mapped all the time), kmap is turned into a page_address call. + + * The issue is that with use of the page_address + virt_to_page + * sequence the top byte value of the original pointer gets lost (gets + * set to KASAN_TAG_KERNEL (0xFF)). 
+ */ + if (tag == KASAN_TAG_KERNEL) + return true; + + untagged_addr = kasan_reset_tag((const void *)addr); + if (unlikely(untagged_addr < + kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + return !kasan_report(addr, size, write, ret_ip); + } + shadow_first = kasan_mem_to_shadow(untagged_addr); + shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1); + for (shadow = shadow_first; shadow <= shadow_last; shadow++) { + if (*shadow != tag) { + return !kasan_report(addr, size, write, ret_ip); + } + } + + return true; +} + +bool kasan_byte_accessible(const void *addr) +{ + u8 tag = get_tag(addr); + void *untagged_addr = kasan_reset_tag(addr); + u8 shadow_byte; + + if (untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) + return false; + + shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(untagged_addr)); + return tag == KASAN_TAG_KERNEL || tag == shadow_byte; +} + +#define DEFINE_HWASAN_LOAD_STORE(size) \ + void __hwasan_load##size##_noabort(unsigned long addr) \ + { \ + kasan_check_range(addr, size, false, _RET_IP_); \ + } \ + EXPORT_SYMBOL(__hwasan_load##size##_noabort); \ + void __hwasan_store##size##_noabort(unsigned long addr) \ + { \ + kasan_check_range(addr, size, true, _RET_IP_); \ + } \ + EXPORT_SYMBOL(__hwasan_store##size##_noabort) + +DEFINE_HWASAN_LOAD_STORE(1); +DEFINE_HWASAN_LOAD_STORE(2); +DEFINE_HWASAN_LOAD_STORE(4); +DEFINE_HWASAN_LOAD_STORE(8); +DEFINE_HWASAN_LOAD_STORE(16); + +void __hwasan_loadN_noabort(unsigned long addr, unsigned long size) +{ + kasan_check_range(addr, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__hwasan_loadN_noabort); + +void __hwasan_storeN_noabort(unsigned long addr, unsigned long size) +{ + kasan_check_range(addr, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__hwasan_storeN_noabort); + +void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) +{ + kasan_poison((void *)addr, size, tag, false); +} +EXPORT_SYMBOL(__hwasan_tag_memory); + +void kasan_tag_mismatch(unsigned long addr, unsigned long 
access_info, + unsigned long ret_ip) +{ + kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10, + ret_ip); +} diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c new file mode 100644 index 000000000..67a222586 --- /dev/null +++ b/mm/kasan/tags.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains common tag-based KASAN code. + * + * Copyright (c) 2018 Google, Inc. + * Copyright (c) 2020 Google, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kasan.h" +#include "../slab.h" + +#define KASAN_STACK_RING_SIZE_DEFAULT (32 << 10) + +enum kasan_arg_stacktrace { + KASAN_ARG_STACKTRACE_DEFAULT, + KASAN_ARG_STACKTRACE_OFF, + KASAN_ARG_STACKTRACE_ON, +}; + +static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; + +/* Whether to collect alloc/free stack traces. */ +DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); + +/* Non-zero, as initial pointer values are 0. */ +#define STACK_RING_BUSY_PTR ((void *)1) + +struct kasan_stack_ring stack_ring = { + .lock = __RW_LOCK_UNLOCKED(stack_ring.lock) +}; + +/* kasan.stacktrace=off/on */ +static int __init early_kasan_flag_stacktrace(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF; + else if (!strcmp(arg, "on")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON; + else + return -EINVAL; + + return 0; +} +early_param("kasan.stacktrace", early_kasan_flag_stacktrace); + +/* kasan.stack_ring_size= */ +static int __init early_kasan_flag_stack_ring_size(char *arg) +{ + if (!arg) + return -EINVAL; + + return kstrtoul(arg, 0, &stack_ring.size); +} +early_param("kasan.stack_ring_size", early_kasan_flag_stack_ring_size); + +void __init kasan_init_tags(void) +{ + switch (kasan_arg_stacktrace) { + case KASAN_ARG_STACKTRACE_DEFAULT: + /* Default is specified by kasan_flag_stacktrace definition. 
*/ + break; + case KASAN_ARG_STACKTRACE_OFF: + static_branch_disable(&kasan_flag_stacktrace); + break; + case KASAN_ARG_STACKTRACE_ON: + static_branch_enable(&kasan_flag_stacktrace); + break; + } + + if (kasan_stack_collection_enabled()) { + if (!stack_ring.size) + stack_ring.size = KASAN_STACK_RING_SIZE_DEFAULT; + stack_ring.entries = memblock_alloc( + sizeof(stack_ring.entries[0]) * stack_ring.size, + SMP_CACHE_BYTES); + if (WARN_ON(!stack_ring.entries)) + static_branch_disable(&kasan_flag_stacktrace); + } +} + +static void save_stack_info(struct kmem_cache *cache, void *object, + gfp_t gfp_flags, bool is_free) +{ + unsigned long flags; + depot_stack_handle_t stack; + u64 pos; + struct kasan_stack_ring_entry *entry; + void *old_ptr; + + stack = kasan_save_stack(gfp_flags, true); + + /* + * Prevent save_stack_info() from modifying stack ring + * when kasan_complete_mode_report_info() is walking it. + */ + read_lock_irqsave(&stack_ring.lock, flags); + +next: + pos = atomic64_fetch_add(1, &stack_ring.pos); + entry = &stack_ring.entries[pos % stack_ring.size]; + + /* Detect stack ring entry slots that are being written to. */ + old_ptr = READ_ONCE(entry->ptr); + if (old_ptr == STACK_RING_BUSY_PTR) + goto next; /* Busy slot. */ + if (!try_cmpxchg(&entry->ptr, &old_ptr, STACK_RING_BUSY_PTR)) + goto next; /* Busy slot. */ + + WRITE_ONCE(entry->size, cache->object_size); + WRITE_ONCE(entry->pid, current->pid); + WRITE_ONCE(entry->stack, stack); + WRITE_ONCE(entry->is_free, is_free); + + /* + * Paired with smp_load_acquire() in kasan_complete_mode_report_info(). 
+ */ + smp_store_release(&entry->ptr, (s64)object); + + read_unlock_irqrestore(&stack_ring.lock, flags); +} + +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +{ + save_stack_info(cache, object, flags, false); +} + +void kasan_save_free_info(struct kmem_cache *cache, void *object) +{ + save_stack_info(cache, object, GFP_NOWAIT, true); +} diff --git a/mm/kfence/.kunitconfig b/mm/kfence/.kunitconfig new file mode 100644 index 000000000..f3d65e939 --- /dev/null +++ b/mm/kfence/.kunitconfig @@ -0,0 +1,6 @@ +CONFIG_KUNIT=y +CONFIG_KFENCE=y +CONFIG_KFENCE_KUNIT_TEST=y + +# Additional dependencies. +CONFIG_FTRACE=y diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile new file mode 100644 index 000000000..2de2a58d1 --- /dev/null +++ b/mm/kfence/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y := core.o report.o + +CFLAGS_kfence_test.o := -fno-omit-frame-pointer -fno-optimize-sibling-calls +obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o diff --git a/mm/kfence/core.c b/mm/kfence/core.c new file mode 100644 index 000000000..c597cfebb --- /dev/null +++ b/mm/kfence/core.c @@ -0,0 +1,1164 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KFENCE guarded object allocator and fault handling. + * + * Copyright (C) 2020, Google LLC. + */ + +#define pr_fmt(fmt) "kfence: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kfence.h" + +/* Disables KFENCE on the first warning assuming an irrecoverable error. 
*/ +#define KFENCE_WARN_ON(cond) \ + ({ \ + const bool __cond = WARN_ON(cond); \ + if (unlikely(__cond)) { \ + WRITE_ONCE(kfence_enabled, false); \ + disabled_by_warn = true; \ + } \ + __cond; \ + }) + +/* === Data ================================================================= */ + +static bool kfence_enabled __read_mostly; +static bool disabled_by_warn __read_mostly; + +unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL; +EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */ + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "kfence." + +static int kfence_enable_late(void); +static int param_set_sample_interval(const char *val, const struct kernel_param *kp) +{ + unsigned long num; + int ret = kstrtoul(val, 0, &num); + + if (ret < 0) + return ret; + + /* Using 0 to indicate KFENCE is disabled. */ + if (!num && READ_ONCE(kfence_enabled)) { + pr_info("disabled\n"); + WRITE_ONCE(kfence_enabled, false); + } + + *((unsigned long *)kp->arg) = num; + + if (num && !READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING) + return disabled_by_warn ? -EINVAL : kfence_enable_late(); + return 0; +} + +static int param_get_sample_interval(char *buffer, const struct kernel_param *kp) +{ + if (!READ_ONCE(kfence_enabled)) + return sprintf(buffer, "0\n"); + + return param_get_ulong(buffer, kp); +} + +static const struct kernel_param_ops sample_interval_param_ops = { + .set = param_set_sample_interval, + .get = param_get_sample_interval, +}; +module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600); + +/* Pool usage% threshold when currently covered allocations are skipped. */ +static unsigned long kfence_skip_covered_thresh __read_mostly = 75; +module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644); + +/* If true, use a deferrable timer. 
*/ +static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE); +module_param_named(deferrable, kfence_deferrable, bool, 0444); + +/* If true, check all canary bytes on panic. */ +static bool kfence_check_on_panic __read_mostly; +module_param_named(check_on_panic, kfence_check_on_panic, bool, 0444); + +/* The pool of pages used for guard pages and objects. */ +char *__kfence_pool __read_mostly; +EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */ + +/* + * Per-object metadata, with one-to-one mapping of object metadata to + * backing pages (in __kfence_pool). + */ +static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0); +struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; + +/* Freelist with available objects. */ +static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist); +static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */ + +/* + * The static key to set up a KFENCE allocation; or if static keys are not used + * to gate allocations, to avoid a load and compare if KFENCE is disabled. + */ +DEFINE_STATIC_KEY_FALSE(kfence_allocation_key); + +/* Gates the allocation, ensuring only one succeeds in a given period. */ +atomic_t kfence_allocation_gate = ATOMIC_INIT(1); + +/* + * A Counting Bloom filter of allocation coverage: limits currently covered + * allocations of the same source filling up the pool. 
+ * + * Assuming a range of 15%-85% unique allocations in the pool at any point in + * time, the below parameters provide a probability of 0.02-0.33 for false + * positive hits respectively: + * + * P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE))) ^ HNUM + */ +#define ALLOC_COVERED_HNUM 2 +#define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2) +#define ALLOC_COVERED_SIZE (1 << ALLOC_COVERED_ORDER) +#define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER) +#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1) +static atomic_t alloc_covered[ALLOC_COVERED_SIZE]; + +/* Stack depth used to determine uniqueness of an allocation. */ +#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8) + +/* + * Randomness for stack hashes, making the same collisions across reboots and + * different machines less likely. + */ +static u32 stack_hash_seed __ro_after_init; + +/* Statistics counters for debugfs. */ +enum kfence_counter_id { + KFENCE_COUNTER_ALLOCATED, + KFENCE_COUNTER_ALLOCS, + KFENCE_COUNTER_FREES, + KFENCE_COUNTER_ZOMBIES, + KFENCE_COUNTER_BUGS, + KFENCE_COUNTER_SKIP_INCOMPAT, + KFENCE_COUNTER_SKIP_CAPACITY, + KFENCE_COUNTER_SKIP_COVERED, + KFENCE_COUNTER_COUNT, +}; +static atomic_long_t counters[KFENCE_COUNTER_COUNT]; +static const char *const counter_names[] = { + [KFENCE_COUNTER_ALLOCATED] = "currently allocated", + [KFENCE_COUNTER_ALLOCS] = "total allocations", + [KFENCE_COUNTER_FREES] = "total frees", + [KFENCE_COUNTER_ZOMBIES] = "zombie allocations", + [KFENCE_COUNTER_BUGS] = "total bugs", + [KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)", + [KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)", + [KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)", +}; +static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT); + +/* === Internals ============================================================ */ + +static inline bool should_skip_covered(void) +{ + unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS
* kfence_skip_covered_thresh) / 100; + + return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh; +} + +static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries) +{ + num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH); + num_entries = filter_irq_stacks(stack_entries, num_entries); + return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed); +} + +/* + * Adds (or subtracts) count @val for allocation stack trace hash + * @alloc_stack_hash from Counting Bloom filter. + */ +static void alloc_covered_add(u32 alloc_stack_hash, int val) +{ + int i; + + for (i = 0; i < ALLOC_COVERED_HNUM; i++) { + atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]); + alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); + } +} + +/* + * Returns true if the allocation stack trace hash @alloc_stack_hash is + * currently contained (non-zero count) in Counting Bloom filter. + */ +static bool alloc_covered_contains(u32 alloc_stack_hash) +{ + int i; + + for (i = 0; i < ALLOC_COVERED_HNUM; i++) { + if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK])) + return false; + alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash); + } + + return true; +} + +static bool kfence_protect(unsigned long addr) +{ + return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true)); +} + +static bool kfence_unprotect(unsigned long addr) +{ + return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false)); +} + +static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta) +{ + unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2; + unsigned long pageaddr = (unsigned long)&__kfence_pool[offset]; + + /* The checks do not affect performance; only called from slow-paths. */ + + /* Only call with a pointer into kfence_metadata. 
*/ + if (KFENCE_WARN_ON(meta < kfence_metadata || + meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS)) + return 0; + + /* + * This metadata object only ever maps to 1 page; verify that the stored + * address is in the expected range. + */ + if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr)) + return 0; + + return pageaddr; +} + +/* + * Update the object's metadata state, including updating the alloc/free stacks + * depending on the state transition. + * + * The free track is updated when @next is KFENCE_OBJECT_FREED, the alloc track + * otherwise. If @stack_entries is NULL, the current stack trace is saved + * instead. Must be called with meta->lock held. + */ +static noinline void +metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next, + unsigned long *stack_entries, size_t num_stack_entries) +{ + struct kfence_track *track = + next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track; + + lockdep_assert_held(&meta->lock); + + if (stack_entries) { + memcpy(track->stack_entries, stack_entries, + num_stack_entries * sizeof(stack_entries[0])); + } else { + /* + * Skip over 1 (this) function; noinline ensures we do not + * accidentally skip over the caller by never inlining. + */ + num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1); + } + track->num_stack_entries = num_stack_entries; + track->pid = task_pid_nr(current); + track->cpu = raw_smp_processor_id(); + track->ts_nsec = local_clock(); /* Same source as printk timestamps. */ + + /* + * Pairs with READ_ONCE() in + * kfence_shutdown_cache(), + * kfence_handle_page_fault(). + */ + WRITE_ONCE(meta->state, next); +} + +/* Write canary byte to @addr. */ +static inline bool set_canary_byte(u8 *addr) +{ + *addr = KFENCE_CANARY_PATTERN(addr); + return true; +} + +/* Check canary byte at @addr. 
*/ +static inline bool check_canary_byte(u8 *addr) +{ + struct kfence_metadata *meta; + unsigned long flags; + + if (likely(*addr == KFENCE_CANARY_PATTERN(addr))) + return true; + + atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); + + meta = addr_to_metadata((unsigned long)addr); + raw_spin_lock_irqsave(&meta->lock, flags); + kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION); + raw_spin_unlock_irqrestore(&meta->lock, flags); + + return false; +} + +/* __always_inline this to ensure we won't do an indirect call to fn. */ +static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *)) +{ + const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE); + unsigned long addr; + + /* + * We'll iterate over each canary byte per-side until fn() returns + * false. However, we'll still iterate over the canary bytes to the + * right of the object even if there was an error in the canary bytes to + * the left of the object. Specifically, if check_canary_byte() + * generates an error, showing both sides might give more clues as to + * what the error is about when displaying which bytes were corrupted. + */ + + /* Apply to left of object. */ + for (addr = pageaddr; addr < meta->addr; addr++) { + if (!fn((u8 *)addr)) + break; + } + + /* Apply to right of object. */ + for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) { + if (!fn((u8 *)addr)) + break; + } +} + +static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, + unsigned long *stack_entries, size_t num_stack_entries, + u32 alloc_stack_hash) +{ + struct kfence_metadata *meta = NULL; + unsigned long flags; + struct slab *slab; + void *addr; + const bool random_right_allocate = prandom_u32_max(2); + const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS && + !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS); + + /* Try to obtain a free object. 
*/ + raw_spin_lock_irqsave(&kfence_freelist_lock, flags); + if (!list_empty(&kfence_freelist)) { + meta = list_entry(kfence_freelist.next, struct kfence_metadata, list); + list_del_init(&meta->list); + } + raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); + if (!meta) { + atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]); + return NULL; + } + + if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) { + /* + * This is extremely unlikely -- we are reporting on a + * use-after-free, which locked meta->lock, and the reporting + * code via printk calls kmalloc() which ends up in + * kfence_alloc() and tries to grab the same object that we're + * reporting on. While it has never been observed, lockdep does + * report that there is a possibility of deadlock. Fix it by + * using trylock and bailing out gracefully. + */ + raw_spin_lock_irqsave(&kfence_freelist_lock, flags); + /* Put the object back on the freelist. */ + list_add_tail(&meta->list, &kfence_freelist); + raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); + + return NULL; + } + + meta->addr = metadata_to_pageaddr(meta); + /* Unprotect if we're reusing this page. */ + if (meta->state == KFENCE_OBJECT_FREED) + kfence_unprotect(meta->addr); + + /* + * Note: for allocations made before RNG initialization, the random + * number source will always return zero, so random_right_allocate is + * always false for them. We still benefit from enabling KFENCE as early as + * possible, even when the RNG is not yet available, as this will allow + * KFENCE to detect bugs due to earlier allocations. The only downside + * is that the out-of-bounds accesses detected are deterministic for + * such allocations. + */ + if (random_right_allocate) { + /* Allocate on the "right" side, re-calculate address. */ + meta->addr += PAGE_SIZE - size; + meta->addr = ALIGN_DOWN(meta->addr, cache->align); + } + + addr = (void *)meta->addr; + + /* Update remaining metadata. 
*/ + metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries); + /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */ + WRITE_ONCE(meta->cache, cache); + meta->size = size; + meta->alloc_stack_hash = alloc_stack_hash; + raw_spin_unlock_irqrestore(&meta->lock, flags); + + alloc_covered_add(alloc_stack_hash, 1); + + /* Set required slab fields. */ + slab = virt_to_slab((void *)meta->addr); + slab->slab_cache = cache; +#if defined(CONFIG_SLUB) + slab->objects = 1; +#elif defined(CONFIG_SLAB) + slab->s_mem = addr; +#endif + + /* Memory initialization. */ + for_each_canary(meta, set_canary_byte); + + /* + * We check slab_want_init_on_alloc() ourselves, rather than letting + * SL*B do the initialization, as otherwise we might overwrite KFENCE's + * redzone. + */ + if (unlikely(slab_want_init_on_alloc(gfp, cache))) + memzero_explicit(addr, size); + if (cache->ctor) + cache->ctor(addr); + + if (random_fault) + kfence_protect(meta->addr); /* Random "faults" by protecting the object. */ + + atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]); + atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]); + + return addr; +} + +static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie) +{ + struct kcsan_scoped_access assert_page_exclusive; + unsigned long flags; + bool init; + + raw_spin_lock_irqsave(&meta->lock, flags); + + if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) { + /* Invalid or double-free, bail out. */ + atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); + kfence_report_error((unsigned long)addr, false, NULL, meta, + KFENCE_ERROR_INVALID_FREE); + raw_spin_unlock_irqrestore(&meta->lock, flags); + return; + } + + /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. 
*/ + kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE, + KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT, + &assert_page_exclusive); + + if (CONFIG_KFENCE_STRESS_TEST_FAULTS) + kfence_unprotect((unsigned long)addr); /* To check canary bytes. */ + + /* Restore page protection if there was an OOB access. */ + if (meta->unprotected_page) { + memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE); + kfence_protect(meta->unprotected_page); + meta->unprotected_page = 0; + } + + /* Mark the object as freed. */ + metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0); + init = slab_want_init_on_free(meta->cache); + raw_spin_unlock_irqrestore(&meta->lock, flags); + + alloc_covered_add(meta->alloc_stack_hash, -1); + + /* Check canary bytes for memory corruption. */ + for_each_canary(meta, check_canary_byte); + + /* + * Clear memory if init-on-free is set. While we protect the page, the + * data is still there, and after a use-after-free is detected, we + * unprotect the page, so the data is still accessible. + */ + if (!zombie && unlikely(init)) + memzero_explicit(addr, meta->size); + + /* Protect to detect use-after-frees. */ + kfence_protect((unsigned long)addr); + + kcsan_end_scoped_access(&assert_page_exclusive); + if (!zombie) { + /* Add it to the tail of the freelist for reuse. */ + raw_spin_lock_irqsave(&kfence_freelist_lock, flags); + KFENCE_WARN_ON(!list_empty(&meta->list)); + list_add_tail(&meta->list, &kfence_freelist); + raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags); + + atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]); + atomic_long_inc(&counters[KFENCE_COUNTER_FREES]); + } else { + /* See kfence_shutdown_cache(). 
*/ + atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]); + } +} + +static void rcu_guarded_free(struct rcu_head *h) +{ + struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head); + + kfence_guarded_free((void *)meta->addr, meta, false); +} + +/* + * Initialization of the KFENCE pool after its allocation. + * Returns 0 on success; otherwise returns the address up to + * which partial initialization succeeded. + */ +static unsigned long kfence_init_pool(void) +{ + unsigned long addr = (unsigned long)__kfence_pool; + struct page *pages; + int i; + + if (!arch_kfence_init_pool()) + return addr; + + pages = virt_to_page(__kfence_pool); + + /* + * Set up object pages: they must have PG_slab set, to avoid freeing + * these as real pages. + * + * We also want to avoid inserting kfence_free() in the kfree() + * fast-path in SLUB, and therefore need to ensure kfree() correctly + * enters __slab_free() slow-path. + */ + for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { + struct slab *slab = page_slab(nth_page(pages, i)); + + if (!i || (i % 2)) + continue; + + __folio_set_slab(slab_folio(slab)); +#ifdef CONFIG_MEMCG + slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg | + MEMCG_DATA_OBJCGS; +#endif + } + + /* + * Protect the first 2 pages. The first page is mostly unnecessary, and + * merely serves as an extended guard page. However, adding one + * additional page in the beginning gives us an even number of pages, + * which simplifies the mapping of address to metadata index. + */ + for (i = 0; i < 2; i++) { + if (unlikely(!kfence_protect(addr))) + return addr; + + addr += PAGE_SIZE; + } + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + struct kfence_metadata *meta = &kfence_metadata[i]; + + /* Initialize metadata. */ + INIT_LIST_HEAD(&meta->list); + raw_spin_lock_init(&meta->lock); + meta->state = KFENCE_OBJECT_UNUSED; + meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). 
*/ + list_add_tail(&meta->list, &kfence_freelist); + + /* Protect the right redzone. */ + if (unlikely(!kfence_protect(addr + PAGE_SIZE))) + goto reset_slab; + + addr += 2 * PAGE_SIZE; + } + + return 0; + +reset_slab: + for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { + struct slab *slab = page_slab(nth_page(pages, i)); + + if (!i || (i % 2)) + continue; +#ifdef CONFIG_MEMCG + slab->memcg_data = 0; +#endif + __folio_clear_slab(slab_folio(slab)); + } + + return addr; +} + +static bool __init kfence_init_pool_early(void) +{ + unsigned long addr; + + if (!__kfence_pool) + return false; + + addr = kfence_init_pool(); + + if (!addr) { + /* + * The pool is live and will never be deallocated from this point on. + * Ignore the pool object from the kmemleak phys object tree, as it would + * otherwise overlap with allocations returned by kfence_alloc(), which + * are registered with kmemleak through the slab post-alloc hook. + */ + kmemleak_ignore_phys(__pa(__kfence_pool)); + return true; + } + + /* + * Only release unprotected pages, and do not try to go back and change + * page attributes due to risk of failing to do so as well. If changing + * page attributes for some pages fails, it is very likely that it also + * fails for the first page, and therefore expect addr==__kfence_pool in + * most failure cases. + */ + memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); + __kfence_pool = NULL; + return false; +} + +static bool kfence_init_pool_late(void) +{ + unsigned long addr, free_size; + + addr = kfence_init_pool(); + + if (!addr) + return true; + + /* Same as above. 
*/ + free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool); +#ifdef CONFIG_CONTIG_ALLOC + free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE); +#else + free_pages_exact((void *)addr, free_size); +#endif + __kfence_pool = NULL; + return false; +} + +/* === DebugFS Interface ==================================================== */ + +static int stats_show(struct seq_file *seq, void *v) +{ + int i; + + seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled)); + for (i = 0; i < KFENCE_COUNTER_COUNT; i++) + seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i])); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(stats); + +/* + * debugfs seq_file operations for /sys/kernel/debug/kfence/objects. + * start_object() and next_object() return the object index + 1, because NULL is used + * to stop iteration. + */ +static void *start_object(struct seq_file *seq, loff_t *pos) +{ + if (*pos < CONFIG_KFENCE_NUM_OBJECTS) + return (void *)((long)*pos + 1); + return NULL; +} + +static void stop_object(struct seq_file *seq, void *v) +{ +} + +static void *next_object(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + if (*pos < CONFIG_KFENCE_NUM_OBJECTS) + return (void *)((long)*pos + 1); + return NULL; +} + +static int show_object(struct seq_file *seq, void *v) +{ + struct kfence_metadata *meta = &kfence_metadata[(long)v - 1]; + unsigned long flags; + + raw_spin_lock_irqsave(&meta->lock, flags); + kfence_print_object(seq, meta); + raw_spin_unlock_irqrestore(&meta->lock, flags); + seq_puts(seq, "---------------------------------\n"); + + return 0; +} + +static const struct seq_operations objects_sops = { + .start = start_object, + .next = next_object, + .stop = stop_object, + .show = show_object, +}; +DEFINE_SEQ_ATTRIBUTE(objects); + +static int kfence_debugfs_init(void) +{ + struct dentry *kfence_dir; + + if (!READ_ONCE(kfence_enabled)) + return 0; + + kfence_dir = debugfs_create_dir("kfence", NULL); + 
debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops); + debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops); + return 0; +} + +late_initcall(kfence_debugfs_init); + +/* === Panic Notifier ====================================================== */ + +static void kfence_check_all_canary(void) +{ + int i; + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + struct kfence_metadata *meta = &kfence_metadata[i]; + + if (meta->state == KFENCE_OBJECT_ALLOCATED) + for_each_canary(meta, check_canary_byte); + } +} + +static int kfence_check_canary_callback(struct notifier_block *nb, + unsigned long reason, void *arg) +{ + kfence_check_all_canary(); + return NOTIFY_OK; +} + +static struct notifier_block kfence_check_canary_notifier = { + .notifier_call = kfence_check_canary_callback, +}; + +/* === Allocation Gate Timer ================================================ */ + +static struct delayed_work kfence_timer; + +#ifdef CONFIG_KFENCE_STATIC_KEYS +/* Wait queue to wake up allocation-gate timer task. */ +static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); + +static void wake_up_kfence_timer(struct irq_work *work) +{ + wake_up(&allocation_wait); +} +static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer); +#endif + +/* + * Set up delayed work, which will enable and disable the static key. We need to + * use a work queue (rather than a simple timer), since enabling and disabling a + * static key cannot be done from an interrupt. + * + * Note: Toggling a static branch currently causes IPIs, and here we'll end up + * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with + * more aggressive sampling intervals), we could get away with a variant that + * avoids IPIs, at the cost of not immediately capturing allocations if the + * instructions remain cached. 
+ */ +static void toggle_allocation_gate(struct work_struct *work) +{ + if (!READ_ONCE(kfence_enabled)) + return; + + atomic_set(&kfence_allocation_gate, 0); +#ifdef CONFIG_KFENCE_STATIC_KEYS + /* Enable static key, and await allocation to happen. */ + static_branch_enable(&kfence_allocation_key); + + if (sysctl_hung_task_timeout_secs) { + /* + * During low activity with no allocations we might wait a + * while; let's avoid the hung task warning. + */ + wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), + sysctl_hung_task_timeout_secs * HZ / 2); + } else { + wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); + } + + /* Disable static key and reset timer. */ + static_branch_disable(&kfence_allocation_key); +#endif + queue_delayed_work(system_unbound_wq, &kfence_timer, + msecs_to_jiffies(kfence_sample_interval)); +} + +/* === Public interface ===================================================== */ + +void __init kfence_alloc_pool(void) +{ + if (!kfence_sample_interval) + return; + + /* if the pool has already been initialized by arch, skip the below. 
*/ + if (__kfence_pool) + return; + + __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); + + if (!__kfence_pool) + pr_err("failed to allocate pool\n"); +} + +static void kfence_init_enable(void) +{ + if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS)) + static_branch_enable(&kfence_allocation_key); + + if (kfence_deferrable) + INIT_DEFERRABLE_WORK(&kfence_timer, toggle_allocation_gate); + else + INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate); + + if (kfence_check_on_panic) + atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier); + + WRITE_ONCE(kfence_enabled, true); + queue_delayed_work(system_unbound_wq, &kfence_timer, 0); + + pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, + CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, + (void *)(__kfence_pool + KFENCE_POOL_SIZE)); +} + +void __init kfence_init(void) +{ + stack_hash_seed = get_random_u32(); + + /* Setting kfence_sample_interval to 0 on boot disables KFENCE. 
*/ + if (!kfence_sample_interval) + return; + + if (!kfence_init_pool_early()) { + pr_err("%s failed\n", __func__); + return; + } + + kfence_init_enable(); +} + +static int kfence_init_late(void) +{ + const unsigned long nr_pages = KFENCE_POOL_SIZE / PAGE_SIZE; +#ifdef CONFIG_CONTIG_ALLOC + struct page *pages; + + pages = alloc_contig_pages(nr_pages, GFP_KERNEL, first_online_node, NULL); + if (!pages) + return -ENOMEM; + __kfence_pool = page_to_virt(pages); +#else + if (nr_pages > MAX_ORDER_NR_PAGES) { + pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n"); + return -EINVAL; + } + __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL); + if (!__kfence_pool) + return -ENOMEM; +#endif + + if (!kfence_init_pool_late()) { + pr_err("%s failed\n", __func__); + return -EBUSY; + } + + kfence_init_enable(); + kfence_debugfs_init(); + + return 0; +} + +static int kfence_enable_late(void) +{ + if (!__kfence_pool) + return kfence_init_late(); + + WRITE_ONCE(kfence_enabled, true); + queue_delayed_work(system_unbound_wq, &kfence_timer, 0); + pr_info("re-enabled\n"); + return 0; +} + +void kfence_shutdown_cache(struct kmem_cache *s) +{ + unsigned long flags; + struct kfence_metadata *meta; + int i; + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + bool in_use; + + meta = &kfence_metadata[i]; + + /* + * If we observe some inconsistent cache and state pair where we + * should have returned false here, cache destruction is racing + * with either kmem_cache_alloc() or kmem_cache_free(). Taking + * the lock will not help, as different critical section + * serialization will have the same outcome. 
+ */ + if (READ_ONCE(meta->cache) != s || + READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED) + continue; + + raw_spin_lock_irqsave(&meta->lock, flags); + in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED; + raw_spin_unlock_irqrestore(&meta->lock, flags); + + if (in_use) { + /* + * This cache still has allocations, and we should not + * release them back into the freelist so they can still + * safely be used and retain the kernel's default + * behaviour of keeping the allocations alive (leak the + * cache); however, they effectively become "zombie + * allocations" as the KFENCE objects are the only ones + * still in use and the owning cache is being destroyed. + * + * We mark them freed, so that any subsequent use shows + * more useful error messages that will include stack + * traces of the user of the object, the original + * allocation, and caller to shutdown_cache(). + */ + kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true); + } + } + + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + meta = &kfence_metadata[i]; + + /* See above. */ + if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED) + continue; + + raw_spin_lock_irqsave(&meta->lock, flags); + if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED) + meta->cache = NULL; + raw_spin_unlock_irqrestore(&meta->lock, flags); + } +} + +void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) +{ + unsigned long stack_entries[KFENCE_STACK_DEPTH]; + size_t num_stack_entries; + u32 alloc_stack_hash; + + /* + * Perform size check before switching kfence_allocation_gate, so that + * we don't disable KFENCE without making an allocation. + */ + if (size > PAGE_SIZE) { + atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); + return NULL; + } + + /* + * Skip allocations from non-default zones, including DMA. We cannot + * guarantee that pages in the KFENCE pool will have the requested + * properties (e.g. reside in DMAable memory). 
+ */ + if ((flags & GFP_ZONEMASK) || + (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) { + atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]); + return NULL; + } + + /* + * Skip allocations for this slab, if KFENCE has been disabled for + * this slab. + */ + if (s->flags & SLAB_SKIP_KFENCE) + return NULL; + + if (atomic_inc_return(&kfence_allocation_gate) > 1) + return NULL; +#ifdef CONFIG_KFENCE_STATIC_KEYS + /* + * waitqueue_active() is fully ordered after the update of + * kfence_allocation_gate per atomic_inc_return(). + */ + if (waitqueue_active(&allocation_wait)) { + /* + * Calling wake_up() here may deadlock when allocations happen + * from within timer code. Use an irq_work to defer it. + */ + irq_work_queue(&wake_up_kfence_timer_work); + } +#endif + + if (!READ_ONCE(kfence_enabled)) + return NULL; + + num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0); + + /* + * Do expensive check for coverage of allocation in slow-path after + * allocation_gate has already become non-zero, even though it might + * mean not making any allocation within a given sample interval. + * + * This ensures reasonable allocation coverage when the pool is almost + * full, including avoiding long-lived allocations of the same source + * filling up the pool (e.g. pagecache allocations). + */ + alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries); + if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) { + atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]); + return NULL; + } + + return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries, + alloc_stack_hash); +} + +size_t kfence_ksize(const void *addr) +{ + const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + + /* + * Read locklessly -- if there is a race with __kfence_alloc(), this is + * either a use-after-free or invalid access. + */ + return meta ? 
meta->size : 0; +} + +void *kfence_object_start(const void *addr) +{ + const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + + /* + * Read locklessly -- if there is a race with __kfence_alloc(), this is + * either a use-after-free or invalid access. + */ + return meta ? (void *)meta->addr : NULL; +} + +void __kfence_free(void *addr) +{ + struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + +#ifdef CONFIG_MEMCG + KFENCE_WARN_ON(meta->objcg); +#endif + /* + * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing + * the object, as the object page may be recycled for other-typed + * objects once it has been freed. meta->cache may be NULL if the cache + * was destroyed. + */ + if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) + call_rcu(&meta->rcu_head, rcu_guarded_free); + else + kfence_guarded_free(addr, meta, false); +} + +bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs) +{ + const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE; + struct kfence_metadata *to_report = NULL; + enum kfence_error_type error_type; + unsigned long flags; + + if (!is_kfence_address((void *)addr)) + return false; + + if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */ + return kfence_unprotect(addr); /* ... unprotect and proceed. */ + + atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]); + + if (page_index % 2) { + /* This is a redzone, report a buffer overflow. */ + struct kfence_metadata *meta; + int distance = 0; + + meta = addr_to_metadata(addr - PAGE_SIZE); + if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { + to_report = meta; + /* Data race ok; distance calculation approximate. */ + distance = addr - data_race(meta->addr + meta->size); + } + + meta = addr_to_metadata(addr + PAGE_SIZE); + if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) { + /* Data race ok; distance calculation approximate. 
*/ + if (!to_report || distance > data_race(meta->addr) - addr) + to_report = meta; + } + + if (!to_report) + goto out; + + raw_spin_lock_irqsave(&to_report->lock, flags); + to_report->unprotected_page = addr; + error_type = KFENCE_ERROR_OOB; + + /* + * If the object was freed before we took the lock, we can still + * report this as an OOB -- the report will simply show the + * stacktrace of the free as well. + */ + } else { + to_report = addr_to_metadata(addr); + if (!to_report) + goto out; + + raw_spin_lock_irqsave(&to_report->lock, flags); + error_type = KFENCE_ERROR_UAF; + /* + * We may race with __kfence_alloc(), and it is possible that a + * freed object may be reallocated. We simply report this as a + * use-after-free, with the stack trace showing the place where + * the object was re-allocated. + */ + } + +out: + if (to_report) { + kfence_report_error(addr, is_write, regs, to_report, error_type); + raw_spin_unlock_irqrestore(&to_report->lock, flags); + } else { + /* This may be a UAF or OOB access, but we can't be sure. */ + kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID); + } + + return kfence_unprotect(addr); /* Unprotect and let access proceed. */ +} diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h new file mode 100644 index 000000000..600f2e243 --- /dev/null +++ b/mm/kfence/kfence.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Kernel Electric-Fence (KFENCE). For more info please see + * Documentation/dev-tools/kfence.rst. + * + * Copyright (C) 2020, Google LLC. + */ + +#ifndef MM_KFENCE_KFENCE_H +#define MM_KFENCE_KFENCE_H + +#include +#include +#include +#include + +#include "../slab.h" /* for struct kmem_cache */ + +/* + * Get the canary byte pattern for @addr. Use a pattern that varies based on the + * lower 3 bits of the address, to detect memory corruptions with higher + * probability, where similar constants are used. 
+ */ +#define KFENCE_CANARY_PATTERN(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7)) + +/* Maximum stack depth for reports. */ +#define KFENCE_STACK_DEPTH 64 + +/* KFENCE object states. */ +enum kfence_object_state { + KFENCE_OBJECT_UNUSED, /* Object is unused. */ + KFENCE_OBJECT_ALLOCATED, /* Object is currently allocated. */ + KFENCE_OBJECT_FREED, /* Object was allocated, and then freed. */ +}; + +/* Alloc/free tracking information. */ +struct kfence_track { + pid_t pid; + int cpu; + u64 ts_nsec; + int num_stack_entries; + unsigned long stack_entries[KFENCE_STACK_DEPTH]; +}; + +/* KFENCE metadata per guarded allocation. */ +struct kfence_metadata { + struct list_head list; /* Freelist node; access under kfence_freelist_lock. */ + struct rcu_head rcu_head; /* For delayed freeing. */ + + /* + * Lock protecting below data; to ensure consistency of the below data, + * since the following may execute concurrently: __kfence_alloc(), + * __kfence_free(), kfence_handle_page_fault(). However, note that we + * cannot grab the same metadata off the freelist twice, and multiple + * __kfence_alloc() cannot run concurrently on the same metadata. + */ + raw_spinlock_t lock; + + /* The current state of the object; see above. */ + enum kfence_object_state state; + + /* + * Allocated object address; cannot be calculated from size, because of + * alignment requirements. + * + * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant. + */ + unsigned long addr; + + /* + * The size of the original allocation. + */ + size_t size; + + /* + * The kmem_cache cache of the last allocation; NULL if never allocated + * or the cache has already been destroyed. + */ + struct kmem_cache *cache; + + /* + * In case of an invalid access, the page that was unprotected; we + * optimistically only store one address. + */ + unsigned long unprotected_page; + + /* Allocation and free stack information. 
*/ + struct kfence_track alloc_track; + struct kfence_track free_track; + /* For updating alloc_covered on frees. */ + u32 alloc_stack_hash; +#ifdef CONFIG_MEMCG + struct obj_cgroup *objcg; +#endif +}; + +extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; + +static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) +{ + long index; + + /* The checks do not affect performance; only called from slow-paths. */ + + if (!is_kfence_address((void *)addr)) + return NULL; + + /* + * May be an invalid index if called with an address at the edge of + * __kfence_pool, in which case we would report an "invalid access" + * error. + */ + index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1; + if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS) + return NULL; + + return &kfence_metadata[index]; +} + +/* KFENCE error types for report generation. */ +enum kfence_error_type { + KFENCE_ERROR_OOB, /* Detected an out-of-bounds access. */ + KFENCE_ERROR_UAF, /* Detected a use-after-free access. */ + KFENCE_ERROR_CORRUPTION, /* Detected a memory corruption on free. */ + KFENCE_ERROR_INVALID, /* Invalid access of unknown type. */ + KFENCE_ERROR_INVALID_FREE, /* Invalid free. */ +}; + +void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, + const struct kfence_metadata *meta, enum kfence_error_type type); + +void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta); + +#endif /* MM_KFENCE_KFENCE_H */ diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c new file mode 100644 index 000000000..a97bffe0c --- /dev/null +++ b/mm/kfence/kfence_test.c @@ -0,0 +1,870 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test cases for KFENCE memory safety error detector. Since the interface with + * which KFENCE's reports are obtained is via the console, this is the output we + * should verify. For each test case checks the presence (or absence) of + * generated reports. 
Relies on 'console' tracepoint to capture reports as they + * appear in the kernel log. + * + * Copyright (C) 2020, Google LLC. + * Author: Alexander Potapenko + * Marco Elver + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kfence.h" + +/* May be overridden by . */ +#ifndef arch_kfence_test_address +#define arch_kfence_test_address(addr) (addr) +#endif + +#define KFENCE_TEST_REQUIRES(test, cond) do { \ + if (!(cond)) \ + kunit_skip((test), "Test requires: " #cond); \ +} while (0) + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + int nlines; + char lines[2][256]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Probe for console output: obtains observed lines of interest. */ +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + int nlines; + + spin_lock_irqsave(&observed.lock, flags); + nlines = observed.nlines; + + if (strnstr(buf, "BUG: KFENCE: ", len) && strnstr(buf, "test_", len)) { + /* + * KFENCE report and related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0]))); + nlines = 1; + } else if (nlines == 1 && (strnstr(buf, "at 0x", len) || strnstr(buf, "of 0x", len))) { + strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0]))); + } + + WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */ + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +static bool report_available(void) +{ + return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines); +} + +/* Information we expect in a report. */ +struct expect_report { + enum kfence_error_type type; /* The type or error. 
*/ + void *fn; /* Function pointer to expected function where access occurred. */ + char *addr; /* Address at which the bad access occurred. */ + bool is_write; /* Is access a write. */ +}; + +static const char *get_access_type(const struct expect_report *r) +{ + return r->is_write ? "write" : "read"; +} + +/* Check observed report matches information in @r. */ +static bool report_matches(const struct expect_report *r) +{ + unsigned long addr = (unsigned long)r->addr; + bool ret = false; + unsigned long flags; + typeof(observed.lines) expect; + const char *end; + char *cur; + + /* Double-checked locking. */ + if (!report_available()) + return false; + + /* Generate expected report contents. */ + + /* Title */ + cur = expect[0]; + end = &expect[0][sizeof(expect[0]) - 1]; + switch (r->type) { + case KFENCE_ERROR_OOB: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: out-of-bounds %s", + get_access_type(r)); + break; + case KFENCE_ERROR_UAF: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: use-after-free %s", + get_access_type(r)); + break; + case KFENCE_ERROR_CORRUPTION: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: memory corruption"); + break; + case KFENCE_ERROR_INVALID: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid %s", + get_access_type(r)); + break; + case KFENCE_ERROR_INVALID_FREE: + cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid free"); + break; + } + + scnprintf(cur, end - cur, " in %pS", r->fn); + /* The exact offset won't match, remove it; also strip module name.
*/ + cur = strchr(expect[0], '+'); + if (cur) + *cur = '\0'; + + /* Access information */ + cur = expect[1]; + end = &expect[1][sizeof(expect[1]) - 1]; + + switch (r->type) { + case KFENCE_ERROR_OOB: + cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r)); + addr = arch_kfence_test_address(addr); + break; + case KFENCE_ERROR_UAF: + cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r)); + addr = arch_kfence_test_address(addr); + break; + case KFENCE_ERROR_CORRUPTION: + cur += scnprintf(cur, end - cur, "Corrupted memory at"); + break; + case KFENCE_ERROR_INVALID: + cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r)); + addr = arch_kfence_test_address(addr); + break; + case KFENCE_ERROR_INVALID_FREE: + cur += scnprintf(cur, end - cur, "Invalid free of"); + break; + } + + cur += scnprintf(cur, end - cur, " 0x%p", (void *)addr); + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. */ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.lines[0], expect[0]) && strstr(observed.lines[1], expect[1]); +out: + spin_unlock_irqrestore(&observed.lock, flags); + return ret; +} + +/* ===== Test cases ===== */ + +#define TEST_PRIV_WANT_MEMCACHE ((void *)1) + +/* Cache used by tests; if NULL, allocate from kmalloc instead. */ +static struct kmem_cache *test_cache; + +static size_t setup_test_cache(struct kunit *test, size_t size, slab_flags_t flags, + void (*ctor)(void *)) +{ + if (test->priv != TEST_PRIV_WANT_MEMCACHE) + return size; + + kunit_info(test, "%s: size=%zu, ctor=%ps\n", __func__, size, ctor); + + /* + * Use SLAB_NOLEAKTRACE to prevent merging with existing caches. Any + * other flag in SLAB_NEVER_MERGE also works. Use SLAB_ACCOUNT to + * allocate via memcg, if enabled. 
+ */ + flags |= SLAB_NOLEAKTRACE | SLAB_ACCOUNT; + test_cache = kmem_cache_create("test", size, 1, flags, ctor); + KUNIT_ASSERT_TRUE_MSG(test, test_cache, "could not create cache"); + + return size; +} + +static void test_cache_destroy(void) +{ + if (!test_cache) + return; + + kmem_cache_destroy(test_cache); + test_cache = NULL; +} + +static inline size_t kmalloc_cache_alignment(size_t size) +{ + return kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)]->align; +} + +/* Must always inline to match stack trace against caller. */ +static __always_inline void test_free(void *ptr) +{ + if (test_cache) + kmem_cache_free(test_cache, ptr); + else + kfree(ptr); +} + +/* + * If this should be a KFENCE allocation, and on which side the allocation and + * the closest guard page should be. + */ +enum allocation_policy { + ALLOCATE_ANY, /* KFENCE, any side. */ + ALLOCATE_LEFT, /* KFENCE, left side of page. */ + ALLOCATE_RIGHT, /* KFENCE, right side of page. */ + ALLOCATE_NONE, /* No KFENCE allocation. */ +}; + +/* + * Try to get a guarded allocation from KFENCE. Uses either kmalloc() or the + * current test_cache if set up. + */ +static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocation_policy policy) +{ + void *alloc; + unsigned long timeout, resched_after; + const char *policy_name; + + switch (policy) { + case ALLOCATE_ANY: + policy_name = "any"; + break; + case ALLOCATE_LEFT: + policy_name = "left"; + break; + case ALLOCATE_RIGHT: + policy_name = "right"; + break; + case ALLOCATE_NONE: + policy_name = "none"; + break; + } + + kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp, + policy_name, !!test_cache); + + /* + * 100x the sample interval should be more than enough to ensure we get + * a KFENCE allocation eventually. 
+ */ + timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval); + /* + * Especially for non-preemption kernels, ensure the allocation-gate + * timer can catch up: after @resched_after, every failed allocation + * attempt yields, to ensure the allocation-gate timer is scheduled. + */ + resched_after = jiffies + msecs_to_jiffies(kfence_sample_interval); + do { + if (test_cache) + alloc = kmem_cache_alloc(test_cache, gfp); + else + alloc = kmalloc(size, gfp); + + if (is_kfence_address(alloc)) { + struct slab *slab = virt_to_slab(alloc); + struct kmem_cache *s = test_cache ?: + kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)]; + + /* + * Verify that various helpers return the right values + * even for KFENCE objects; these are required so that + * memcg accounting works correctly. + */ + KUNIT_EXPECT_EQ(test, obj_to_index(s, slab, alloc), 0U); + KUNIT_EXPECT_EQ(test, objs_per_slab(s, slab), 1); + + if (policy == ALLOCATE_ANY) + return alloc; + if (policy == ALLOCATE_LEFT && PAGE_ALIGNED(alloc)) + return alloc; + if (policy == ALLOCATE_RIGHT && !PAGE_ALIGNED(alloc)) + return alloc; + } else if (policy == ALLOCATE_NONE) + return alloc; + + test_free(alloc); + + if (time_after(jiffies, resched_after)) + cond_resched(); + } while (time_before(jiffies, timeout)); + + KUNIT_ASSERT_TRUE_MSG(test, false, "failed to allocate from KFENCE"); + return NULL; /* Unreachable. */ +} + +static void test_out_of_bounds_read(struct kunit *test) +{ + size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_OOB, + .fn = test_out_of_bounds_read, + .is_write = false, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + + /* + * If we don't have our own cache, adjust based on alignment, so that we + * actually access guard pages on either side. + */ + if (!test_cache) + size = kmalloc_cache_alignment(size); + + /* Test both sides. 
*/ + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); + expect.addr = buf - 1; + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + test_free(buf); + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + expect.addr = buf + size; + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + test_free(buf); +} + +static void test_out_of_bounds_write(struct kunit *test) +{ + size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_OOB, + .fn = test_out_of_bounds_write, + .is_write = true, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); + expect.addr = buf - 1; + WRITE_ONCE(*expect.addr, 42); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + test_free(buf); +} + +static void test_use_after_free_read(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_use_after_free_read, + .is_write = false, + }; + + setup_test_cache(test, size, 0, NULL); + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + test_free(expect.addr); + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static void test_double_free(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_INVALID_FREE, + .fn = test_double_free, + }; + + setup_test_cache(test, size, 0, NULL); + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + test_free(expect.addr); + test_free(expect.addr); /* Double-free. 
*/ + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static void test_invalid_addr_free(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_INVALID_FREE, + .fn = test_invalid_addr_free, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + expect.addr = buf + 1; /* Free on invalid address. */ + test_free(expect.addr); /* Invalid address free. */ + test_free(buf); /* No error. */ + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static void test_corruption(struct kunit *test) +{ + size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_CORRUPTION, + .fn = test_corruption, + }; + char *buf; + + setup_test_cache(test, size, 0, NULL); + + /* Test both sides. */ + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT); + expect.addr = buf + size; + WRITE_ONCE(*expect.addr, 42); + test_free(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + expect.addr = buf - 1; + WRITE_ONCE(*expect.addr, 42); + test_free(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * KFENCE is unable to detect an OOB if the allocation's alignment requirements + * leave a gap between the object and the guard page. Specifically, an + * allocation of e.g. 73 bytes is aligned on 8 and 128 bytes for SLUB or SLAB + * respectively. Therefore it is impossible for the allocated object to + * contiguously line up with the right guard page. + * + * However, we test that an access to memory beyond the gap results in KFENCE + * detecting an OOB access. 
+ */ +static void test_kmalloc_aligned_oob_read(struct kunit *test) +{ + const size_t size = 73; + const size_t align = kmalloc_cache_alignment(size); + struct expect_report expect = { + .type = KFENCE_ERROR_OOB, + .fn = test_kmalloc_aligned_oob_read, + .is_write = false, + }; + char *buf; + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + + /* + * The object is offset to the right, so there won't be an OOB to the + * left of it. + */ + READ_ONCE(*(buf - 1)); + KUNIT_EXPECT_FALSE(test, report_available()); + + /* + * @buf must be aligned on @align, therefore buf + size belongs to the + * same page -> no OOB. + */ + READ_ONCE(*(buf + size)); + KUNIT_EXPECT_FALSE(test, report_available()); + + /* Overflowing by @align bytes will result in an OOB. */ + expect.addr = buf + size + align; + READ_ONCE(*expect.addr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + + test_free(buf); +} + +static void test_kmalloc_aligned_oob_write(struct kunit *test) +{ + const size_t size = 73; + struct expect_report expect = { + .type = KFENCE_ERROR_CORRUPTION, + .fn = test_kmalloc_aligned_oob_write, + }; + char *buf; + + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT); + /* + * The object is offset to the right, so we won't get a page + * fault immediately after it. + */ + expect.addr = buf + size; + WRITE_ONCE(*expect.addr, READ_ONCE(*expect.addr) + 1); + KUNIT_EXPECT_FALSE(test, report_available()); + test_free(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test cache shrinking and destroying with KFENCE. */ +static void test_shrink_memcache(struct kunit *test) +{ + const size_t size = 32; + void *buf; + + setup_test_cache(test, size, 0, NULL); + KUNIT_EXPECT_TRUE(test, test_cache); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + kmem_cache_shrink(test_cache); + test_free(buf); + + KUNIT_EXPECT_FALSE(test, report_available()); +} + +static void ctor_set_x(void *obj) +{ + /* Every object has at least 8 bytes. 
*/ + memset(obj, 'x', 8); +} + +/* Ensure that SL*B does not modify KFENCE objects on bulk free. */ +static void test_free_bulk(struct kunit *test) +{ + int iter; + + for (iter = 0; iter < 5; iter++) { + const size_t size = setup_test_cache(test, 8 + prandom_u32_max(300), 0, + (iter & 1) ? ctor_set_x : NULL); + void *objects[] = { + test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), + test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), + }; + + kmem_cache_free_bulk(test_cache, ARRAY_SIZE(objects), objects); + KUNIT_ASSERT_FALSE(test, report_available()); + test_cache_destroy(); + } +} + +/* Test init-on-free works. */ +static void test_init_on_free(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_init_on_free, + .is_write = false, + }; + int i; + + KFENCE_TEST_REQUIRES(test, IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON)); + /* Assume it hasn't been disabled on command line. */ + + setup_test_cache(test, size, 0, NULL); + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + for (i = 0; i < size; i++) + expect.addr[i] = i + 1; + test_free(expect.addr); + + for (i = 0; i < size; i++) { + /* + * This may fail if the page was recycled by KFENCE and then + * written to again -- this however, is near impossible with a + * default config. + */ + KUNIT_EXPECT_EQ(test, expect.addr[i], (char)0); + + if (!i) /* Only check first access to not fail test if page is ever re-protected. */ + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + } +} + +/* Ensure that constructors work properly. 
*/ +static void test_memcache_ctor(struct kunit *test) +{ + const size_t size = 32; + char *buf; + int i; + + setup_test_cache(test, size, 0, ctor_set_x); + buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + + for (i = 0; i < 8; i++) + KUNIT_EXPECT_EQ(test, buf[i], (char)'x'); + + test_free(buf); + + KUNIT_EXPECT_FALSE(test, report_available()); +} + +/* Test that memory is zeroed if requested. */ +static void test_gfpzero(struct kunit *test) +{ + const size_t size = PAGE_SIZE; /* PAGE_SIZE so we can use ALLOCATE_ANY. */ + char *buf1, *buf2; + int i; + + /* Skip if we think it'd take too long. */ + KFENCE_TEST_REQUIRES(test, kfence_sample_interval <= 100); + + setup_test_cache(test, size, 0, NULL); + buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + for (i = 0; i < size; i++) + buf1[i] = i + 1; + test_free(buf1); + + /* Try to get same address again -- this can take a while. */ + for (i = 0;; i++) { + buf2 = test_alloc(test, size, GFP_KERNEL | __GFP_ZERO, ALLOCATE_ANY); + if (buf1 == buf2) + break; + test_free(buf2); + + if (kthread_should_stop() || (i == CONFIG_KFENCE_NUM_OBJECTS)) { + kunit_warn(test, "giving up ... cannot get same object back\n"); + return; + } + cond_resched(); + } + + for (i = 0; i < size; i++) + KUNIT_EXPECT_EQ(test, buf2[i], (char)0); + + test_free(buf2); + + KUNIT_EXPECT_FALSE(test, report_available()); +} + +static void test_invalid_access(struct kunit *test) +{ + const struct expect_report expect = { + .type = KFENCE_ERROR_INVALID, + .fn = test_invalid_access, + .addr = &__kfence_pool[10], + .is_write = false, + }; + + READ_ONCE(__kfence_pool[10]); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test SLAB_TYPESAFE_BY_RCU works. 
*/ +static void test_memcache_typesafe_by_rcu(struct kunit *test) +{ + const size_t size = 32; + struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_memcache_typesafe_by_rcu, + .is_write = false, + }; + + setup_test_cache(test, size, SLAB_TYPESAFE_BY_RCU, NULL); + KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */ + + expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + *expect.addr = 42; + + rcu_read_lock(); + test_free(expect.addr); + KUNIT_EXPECT_EQ(test, *expect.addr, (char)42); + /* + * Up to this point, memory should not have been freed yet, and + * therefore there should be no KFENCE report from the above access. + */ + rcu_read_unlock(); + + /* Above access to @expect.addr should not have generated a report! */ + KUNIT_EXPECT_FALSE(test, report_available()); + + /* Only after rcu_barrier() is the memory guaranteed to be freed. */ + rcu_barrier(); + + /* Expect use-after-free. */ + KUNIT_EXPECT_EQ(test, *expect.addr, (char)42); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test krealloc(). */ +static void test_krealloc(struct kunit *test) +{ + const size_t size = 32; + const struct expect_report expect = { + .type = KFENCE_ERROR_UAF, + .fn = test_krealloc, + .addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY), + .is_write = false, + }; + char *buf = expect.addr; + int i; + + KUNIT_EXPECT_FALSE(test, test_cache); + KUNIT_EXPECT_EQ(test, ksize(buf), size); /* Precise size match after KFENCE alloc. */ + for (i = 0; i < size; i++) + buf[i] = i + 1; + + /* Check that we successfully change the size. */ + buf = krealloc(buf, size * 3, GFP_KERNEL); /* Grow. */ + /* Note: Might no longer be a KFENCE alloc. */ + KUNIT_EXPECT_GE(test, ksize(buf), size * 3); + for (i = 0; i < size; i++) + KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1)); + for (; i < size * 3; i++) /* Fill to extra bytes. */ + buf[i] = i + 1; + + buf = krealloc(buf, size * 2, GFP_KERNEL); /* Shrink. 
*/ + KUNIT_EXPECT_GE(test, ksize(buf), size * 2); + for (i = 0; i < size * 2; i++) + KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1)); + + buf = krealloc(buf, 0, GFP_KERNEL); /* Free. */ + KUNIT_EXPECT_EQ(test, (unsigned long)buf, (unsigned long)ZERO_SIZE_PTR); + KUNIT_ASSERT_FALSE(test, report_available()); /* No reports yet! */ + + READ_ONCE(*expect.addr); /* Ensure krealloc() actually freed earlier KFENCE object. */ + KUNIT_ASSERT_TRUE(test, report_matches(&expect)); +} + +/* Test that some objects from a bulk allocation belong to KFENCE pool. */ +static void test_memcache_alloc_bulk(struct kunit *test) +{ + const size_t size = 32; + bool pass = false; + unsigned long timeout; + + setup_test_cache(test, size, 0, NULL); + KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */ + /* + * 100x the sample interval should be more than enough to ensure we get + * a KFENCE allocation eventually. + */ + timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval); + do { + void *objects[100]; + int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects), + objects); + if (!num) + continue; + for (i = 0; i < ARRAY_SIZE(objects); i++) { + if (is_kfence_address(objects[i])) { + pass = true; + break; + } + } + kmem_cache_free_bulk(test_cache, num, objects); + /* + * kmem_cache_alloc_bulk() disables interrupts, and calling it + * in a tight loop may not give KFENCE a chance to switch the + * static branch. Call cond_resched() to let KFENCE chime in. + */ + cond_resched(); + } while (!pass && time_before(jiffies, timeout)); + + KUNIT_EXPECT_TRUE(test, pass); + KUNIT_EXPECT_FALSE(test, report_available()); +} + +/* + * KUnit does not provide a way to provide arguments to tests, and we encode + * additional info in the name. Set up 2 tests per test case, one using the + * default allocator, and another using a custom memcache (suffix '-memcache'). 
+ */ +#define KFENCE_KUNIT_CASE(test_name) \ + { .run_case = test_name, .name = #test_name }, \ + { .run_case = test_name, .name = #test_name "-memcache" } + +static struct kunit_case kfence_test_cases[] = { + KFENCE_KUNIT_CASE(test_out_of_bounds_read), + KFENCE_KUNIT_CASE(test_out_of_bounds_write), + KFENCE_KUNIT_CASE(test_use_after_free_read), + KFENCE_KUNIT_CASE(test_double_free), + KFENCE_KUNIT_CASE(test_invalid_addr_free), + KFENCE_KUNIT_CASE(test_corruption), + KFENCE_KUNIT_CASE(test_free_bulk), + KFENCE_KUNIT_CASE(test_init_on_free), + KUNIT_CASE(test_kmalloc_aligned_oob_read), + KUNIT_CASE(test_kmalloc_aligned_oob_write), + KUNIT_CASE(test_shrink_memcache), + KUNIT_CASE(test_memcache_ctor), + KUNIT_CASE(test_invalid_access), + KUNIT_CASE(test_gfpzero), + KUNIT_CASE(test_memcache_typesafe_by_rcu), + KUNIT_CASE(test_krealloc), + KUNIT_CASE(test_memcache_alloc_bulk), + {}, +}; + +/* ===== End test cases ===== */ + +static int test_init(struct kunit *test) +{ + unsigned long flags; + int i; + + if (!__kfence_pool) + return -EINVAL; + + spin_lock_irqsave(&observed.lock, flags); + for (i = 0; i < ARRAY_SIZE(observed.lines); i++) + observed.lines[i][0] = '\0'; + observed.nlines = 0; + spin_unlock_irqrestore(&observed.lock, flags); + + /* Any test with 'memcache' in its name will want a memcache. 
*/ + if (strstr(test->name, "memcache")) + test->priv = TEST_PRIV_WANT_MEMCACHE; + else + test->priv = NULL; + + return 0; +} + +static void test_exit(struct kunit *test) +{ + test_cache_destroy(); +} + +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +static int kfence_suite_init(struct kunit_suite *suite) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return 0; +} + +static void kfence_suite_exit(struct kunit_suite *suite) +{ + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +static struct kunit_suite kfence_test_suite = { + .name = "kfence", + .test_cases = kfence_test_cases, + .init = test_init, + .exit = test_exit, + .suite_init = kfence_suite_init, + .suite_exit = kfence_suite_exit, +}; + +kunit_test_suites(&kfence_test_suite); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Alexander Potapenko , Marco Elver "); diff --git a/mm/kfence/report.c b/mm/kfence/report.c new file mode 100644 index 000000000..60205f125 --- /dev/null +++ b/mm/kfence/report.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KFENCE reporting. + * + * Copyright (C) 2020, Google LLC. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kfence.h" + +/* May be overridden by . 
*/ +#ifndef ARCH_FUNC_PREFIX +#define ARCH_FUNC_PREFIX "" +#endif + +extern bool no_hash_pointers; + +/* Helper function to either print to a seq_file or to console. */ +__printf(2, 3) +static void seq_con_printf(struct seq_file *seq, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + if (seq) + seq_vprintf(seq, fmt, args); + else + vprintk(fmt, args); + va_end(args); +} + +/* + * Get the number of stack entries to skip to get out of MM internals. @type is + * optional, and if set to NULL, assumes an allocation or free stack. + */ +static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries, + const enum kfence_error_type *type) +{ + char buf[64]; + int skipnr, fallback = 0; + + if (type) { + /* Depending on error type, find different stack entries. */ + switch (*type) { + case KFENCE_ERROR_UAF: + case KFENCE_ERROR_OOB: + case KFENCE_ERROR_INVALID: + /* + * kfence_handle_page_fault() may be called with pt_regs + * set to NULL; in that case we'll simply show the full + * stack trace. + */ + return 0; + case KFENCE_ERROR_CORRUPTION: + case KFENCE_ERROR_INVALID_FREE: + break; + } + } + + for (skipnr = 0; skipnr < num_entries; skipnr++) { + int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]); + + if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") || + !strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) { + /* + * In case of tail calls from any of the below to any of + * the above, optimized by the compiler such that the + * stack trace would omit the initial entry point below. + */ + fallback = skipnr + 1; + } + + /* + * The below list should only include the initial entry points + * into the slab allocators. Includes the *_bulk() variants by + * checking prefixes. 
+ */ + if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc")) + goto found; + } + if (fallback < num_entries) + return fallback; +found: + skipnr++; + return skipnr < num_entries ? skipnr : 0; +} + +static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadata *meta, + bool show_alloc) +{ + const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track; + u64 ts_sec = track->ts_nsec; + unsigned long rem_nsec = do_div(ts_sec, NSEC_PER_SEC); + + /* Timestamp matches printk timestamp format. */ + seq_con_printf(seq, "%s by task %d on cpu %d at %lu.%06lus:\n", + show_alloc ? "allocated" : "freed", track->pid, + track->cpu, (unsigned long)ts_sec, rem_nsec / 1000); + + if (track->num_stack_entries) { + /* Skip allocation/free internals stack. */ + int i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL); + + /* stack_trace_seq_print() does not exist; open code our own. */ + for (; i < track->num_stack_entries; i++) + seq_con_printf(seq, " %pS\n", (void *)track->stack_entries[i]); + } else { + seq_con_printf(seq, " no %s stack\n", show_alloc ? "allocation" : "deallocation"); + } +} + +void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta) +{ + const int size = abs(meta->size); + const unsigned long start = meta->addr; + const struct kmem_cache *const cache = meta->cache; + + lockdep_assert_held(&meta->lock); + + if (meta->state == KFENCE_OBJECT_UNUSED) { + seq_con_printf(seq, "kfence-#%td unused\n", meta - kfence_metadata); + return; + } + + seq_con_printf(seq, "kfence-#%td: 0x%p-0x%p, size=%d, cache=%s\n\n", + meta - kfence_metadata, (void *)start, (void *)(start + size - 1), + size, (cache && cache->name) ? 
cache->name : ""); + + kfence_print_stack(seq, meta, true); + + if (meta->state == KFENCE_OBJECT_FREED) { + seq_con_printf(seq, "\n"); + kfence_print_stack(seq, meta, false); + } +} + +/* + * Show bytes at @address that are different from the expected canary values, up to + * @bytes_to_show. + */ +static void print_diff_canary(unsigned long address, size_t bytes_to_show, + const struct kfence_metadata *meta) +{ + const unsigned long show_until_addr = address + bytes_to_show; + const u8 *cur, *end; + + /* Do not show contents of object nor read into following guard page. */ + end = (const u8 *)(address < meta->addr ? min(show_until_addr, meta->addr) + : min(show_until_addr, PAGE_ALIGN(address))); + + pr_cont("["); + for (cur = (const u8 *)address; cur < end; cur++) { + if (*cur == KFENCE_CANARY_PATTERN(cur)) + pr_cont(" ."); + else if (no_hash_pointers) + pr_cont(" 0x%02x", *cur); + else /* Do not leak kernel memory in non-debug builds. */ + pr_cont(" !"); + } + pr_cont(" ]"); +} + +static const char *get_access_type(bool is_write) +{ + return is_write ? "write" : "read"; +} + +void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, + const struct kfence_metadata *meta, enum kfence_error_type type) +{ + unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 }; + const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1; + int num_stack_entries; + int skipnr = 0; + + if (regs) { + num_stack_entries = stack_trace_save_regs(regs, stack_entries, KFENCE_STACK_DEPTH, 0); + } else { + num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1); + skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type); + } + + /* Require non-NULL meta, except if KFENCE_ERROR_INVALID.
*/ + if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta)) + return; + + if (meta) + lockdep_assert_held(&meta->lock); + /* + * Because we may generate reports in printk-unfriendly parts of the + * kernel, such as scheduler code, the use of printk() could deadlock. + * Until such time that all printing code here is safe in all parts of + * the kernel, accept the risk, and just get our message out (given the + * system might already behave unpredictably due to the memory error). + * As such, also disable lockdep to hide warnings, and avoid disabling + * lockdep for the rest of the kernel. + */ + lockdep_off(); + + pr_err("==================================================================\n"); + /* Print report header. */ + switch (type) { + case KFENCE_ERROR_OOB: { + const bool left_of_object = address < meta->addr; + + pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write), + (void *)stack_entries[skipnr]); + pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%td):\n", + get_access_type(is_write), (void *)address, + left_of_object ? meta->addr - address : address - meta->addr, + left_of_object ? 
"left" : "right", object_index); + break; + } + case KFENCE_ERROR_UAF: + pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write), + (void *)stack_entries[skipnr]); + pr_err("Use-after-free %s at 0x%p (in kfence-#%td):\n", + get_access_type(is_write), (void *)address, object_index); + break; + case KFENCE_ERROR_CORRUPTION: + pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Corrupted memory at 0x%p ", (void *)address); + print_diff_canary(address, 16, meta); + pr_cont(" (in kfence-#%td):\n", object_index); + break; + case KFENCE_ERROR_INVALID: + pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write), + (void *)stack_entries[skipnr]); + pr_err("Invalid %s at 0x%p:\n", get_access_type(is_write), + (void *)address); + break; + case KFENCE_ERROR_INVALID_FREE: + pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]); + pr_err("Invalid free of 0x%p (in kfence-#%td):\n", (void *)address, + object_index); + break; + } + + /* Print stack trace and object info. */ + stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, 0); + + if (meta) { + pr_err("\n"); + kfence_print_object(NULL, meta); + } + + /* Print report footer. */ + pr_err("\n"); + if (no_hash_pointers && regs) + show_regs(regs); + else + dump_stack_print_info(KERN_ERR); + trace_error_report_end(ERROR_DETECTOR_KFENCE, address); + pr_err("==================================================================\n"); + + lockdep_on(); + + check_panic_on_warn("KFENCE"); + + /* We encountered a memory safety error, taint the kernel! 
*/ + add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); +} + +#ifdef CONFIG_PRINTK +static void kfence_to_kp_stack(const struct kfence_track *track, void **kp_stack) +{ + int i, j; + + i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL); + for (j = 0; i < track->num_stack_entries && j < KS_ADDRS_COUNT; ++i, ++j) + kp_stack[j] = (void *)track->stack_entries[i]; + if (j < KS_ADDRS_COUNT) + kp_stack[j] = NULL; +} + +bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) +{ + struct kfence_metadata *meta = addr_to_metadata((unsigned long)object); + unsigned long flags; + + if (!meta) + return false; + + /* + * If state is UNUSED at least show the pointer requested; the rest + * would be garbage data. + */ + kpp->kp_ptr = object; + + /* Requesting info on a never-used object is almost certainly a bug. */ + if (WARN_ON(meta->state == KFENCE_OBJECT_UNUSED)) + return true; + + raw_spin_lock_irqsave(&meta->lock, flags); + + kpp->kp_slab = slab; + kpp->kp_slab_cache = meta->cache; + kpp->kp_objp = (void *)meta->addr; + kfence_to_kp_stack(&meta->alloc_track, kpp->kp_stack); + if (meta->state == KFENCE_OBJECT_FREED) + kfence_to_kp_stack(&meta->free_track, kpp->kp_free_stack); + /* get_stack_skipnr() ensures the first entry is outside allocator.
*/ + kpp->kp_ret = kpp->kp_stack[0]; + + raw_spin_unlock_irqrestore(&meta->lock, flags); + + return true; +} +#endif diff --git a/mm/khugepaged.c b/mm/khugepaged.c new file mode 100644 index 000000000..65bd0b105 --- /dev/null +++ b/mm/khugepaged.c @@ -0,0 +1,2738 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "internal.h" +#include "mm_slot.h" + +enum scan_result { + SCAN_FAIL, + SCAN_SUCCEED, + SCAN_PMD_NULL, + SCAN_PMD_NONE, + SCAN_PMD_MAPPED, + SCAN_EXCEED_NONE_PTE, + SCAN_EXCEED_SWAP_PTE, + SCAN_EXCEED_SHARED_PTE, + SCAN_PTE_NON_PRESENT, + SCAN_PTE_UFFD_WP, + SCAN_PTE_MAPPED_HUGEPAGE, + SCAN_PAGE_RO, + SCAN_LACK_REFERENCED_PAGE, + SCAN_PAGE_NULL, + SCAN_SCAN_ABORT, + SCAN_PAGE_COUNT, + SCAN_PAGE_LRU, + SCAN_PAGE_LOCK, + SCAN_PAGE_ANON, + SCAN_PAGE_COMPOUND, + SCAN_ANY_PROCESS, + SCAN_VMA_NULL, + SCAN_VMA_CHECK, + SCAN_ADDRESS_RANGE, + SCAN_DEL_PAGE_LRU, + SCAN_ALLOC_HUGE_PAGE_FAIL, + SCAN_CGROUP_CHARGE_FAIL, + SCAN_TRUNCATED, + SCAN_PAGE_HAS_PRIVATE, +}; + +#define CREATE_TRACE_POINTS +#include + +static struct task_struct *khugepaged_thread __read_mostly; +static DEFINE_MUTEX(khugepaged_mutex); + +/* default scan 8*512 pte (or vmas) every 30 second */ +static unsigned int khugepaged_pages_to_scan __read_mostly; +static unsigned int khugepaged_pages_collapsed; +static unsigned int khugepaged_full_scans; +static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; +/* during fragmentation poll the hugepage allocator once every minute */ +static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; +static unsigned long khugepaged_sleep_expire; +static DEFINE_SPINLOCK(khugepaged_mm_lock); +static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); +/* + * default collapse hugepages if there is at 
least one pte mapped like + * it would have happened if the vma was large enough during page + * fault. + * + * Note that these are only respected if collapse was initiated by khugepaged. + */ +static unsigned int khugepaged_max_ptes_none __read_mostly; +static unsigned int khugepaged_max_ptes_swap __read_mostly; +static unsigned int khugepaged_max_ptes_shared __read_mostly; + +#define MM_SLOTS_HASH_BITS 10 +static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); + +static struct kmem_cache *mm_slot_cache __read_mostly; + +#define MAX_PTE_MAPPED_THP 8 /* capacity of khugepaged_mm_slot::pte_mapped_thp[] */ + +struct collapse_control { + bool is_khugepaged; /* true when the collapse is driven by khugepaged itself */ + + /* Num pages scanned per node */ + u32 node_load[MAX_NUMNODES]; + + /* nodemask for allocation fallback */ + nodemask_t alloc_nmask; +}; + +/** + * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned + * @slot: hash lookup from mm to mm_slot + * @nr_pte_mapped_thp: number of pte mapped THP + * @pte_mapped_thp: addresses of the pte mapped THP (up to MAX_PTE_MAPPED_THP) + */ +struct khugepaged_mm_slot { + struct mm_slot slot; + + /* pte-mapped THP in this mm */ + int nr_pte_mapped_thp; + unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP]; +}; + +/** + * struct khugepaged_scan - cursor for scanning + * @mm_head: the head of the mm list to scan + * @mm_slot: the current mm_slot we are scanning + * @address: the next address inside that to be scanned + * + * There is only the one khugepaged_scan instance of this cursor structure. 
+ */ +struct khugepaged_scan { + struct list_head mm_head; + struct khugepaged_mm_slot *mm_slot; + unsigned long address; +}; + +static struct khugepaged_scan khugepaged_scan = { + .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), +}; + +#ifdef CONFIG_SYSFS +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); +} + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int msecs; + int err; + + err = kstrtouint(buf, 10, &msecs); + if (err) + return -EINVAL; + + khugepaged_scan_sleep_millisecs = msecs; + khugepaged_sleep_expire = 0; /* clear the recorded expiry, then wake sleepers on khugepaged_wait */ + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute scan_sleep_millisecs_attr = + __ATTR_RW(scan_sleep_millisecs); + +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); +} + +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int msecs; + int err; + + err = kstrtouint(buf, 10, &msecs); + if (err) + return -EINVAL; + + khugepaged_alloc_sleep_millisecs = msecs; + khugepaged_sleep_expire = 0; /* same wake-up sequence as scan_sleep_millisecs_store() */ + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute alloc_sleep_millisecs_attr = + __ATTR_RW(alloc_sleep_millisecs); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); +} +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int pages; + int err; + + err = kstrtouint(buf, 10, &pages); + if (err || !pages) + return -EINVAL; /* zero is rejected along with unparsable input */ + + khugepaged_pages_to_scan 
= pages; + + return count; +} +static struct kobj_attribute pages_to_scan_attr = + __ATTR_RW(pages_to_scan); + +static ssize_t pages_collapsed_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); +} +static struct kobj_attribute pages_collapsed_attr = + __ATTR_RO(pages_collapsed); /* read-only: there is no store handler */ + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", khugepaged_full_scans); +} +static struct kobj_attribute full_scans_attr = + __ATTR_RO(full_scans); /* read-only: there is no store handler */ + +static ssize_t defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_hugepage_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static ssize_t defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_hugepage_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static struct kobj_attribute khugepaged_defrag_attr = + __ATTR_RW(defrag); + +/* + * max_ptes_none controls if khugepaged should collapse hugepages over + * any unmapped ptes in turn potentially increasing the memory + * footprint of the vmas. When max_ptes_none is 0 khugepaged will not + * reduce the available free memory in the system as it + * runs. Increasing max_ptes_none will instead potentially reduce the + * free memory in the system during the khugepaged scan. 
+ */ +static ssize_t max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); +} +static ssize_t max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_none; + + err = kstrtoul(buf, 10, &max_ptes_none); + if (err || max_ptes_none > HPAGE_PMD_NR - 1) + return -EINVAL; /* accepted range is 0 .. HPAGE_PMD_NR - 1 */ + + khugepaged_max_ptes_none = max_ptes_none; + + return count; +} +static struct kobj_attribute khugepaged_max_ptes_none_attr = + __ATTR_RW(max_ptes_none); + +static ssize_t max_ptes_swap_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); +} + +static ssize_t max_ptes_swap_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_swap; + + err = kstrtoul(buf, 10, &max_ptes_swap); + if (err || max_ptes_swap > HPAGE_PMD_NR - 1) + return -EINVAL; /* accepted range is 0 .. HPAGE_PMD_NR - 1 */ + + khugepaged_max_ptes_swap = max_ptes_swap; + + return count; +} + +static struct kobj_attribute khugepaged_max_ptes_swap_attr = + __ATTR_RW(max_ptes_swap); + +static ssize_t max_ptes_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); +} + +static ssize_t max_ptes_shared_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_shared; + + err = kstrtoul(buf, 10, &max_ptes_shared); + if (err || max_ptes_shared > HPAGE_PMD_NR - 1) + return -EINVAL; /* accepted range is 0 .. HPAGE_PMD_NR - 1 */ + + khugepaged_max_ptes_shared = max_ptes_shared; + + return count; +} + +static struct kobj_attribute khugepaged_max_ptes_shared_attr = + __ATTR_RW(max_ptes_shared); + +static struct attribute *khugepaged_attr[] = { + &khugepaged_defrag_attr.attr, + &khugepaged_max_ptes_none_attr.attr, + &khugepaged_max_ptes_swap_attr.attr, 
+ &khugepaged_max_ptes_shared_attr.attr, + &pages_to_scan_attr.attr, + &pages_collapsed_attr.attr, + &full_scans_attr.attr, + &scan_sleep_millisecs_attr.attr, + &alloc_sleep_millisecs_attr.attr, + NULL, +}; + +struct attribute_group khugepaged_attr_group = { + .attrs = khugepaged_attr, + .name = "khugepaged", +}; +#endif /* CONFIG_SYSFS */ + +int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) +{ + switch (advice) { + case MADV_HUGEPAGE: +#ifdef CONFIG_S390 + /* + * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 + * can't handle this properly after s390_enable_sie, so we simply + * ignore the madvise to prevent qemu from causing a SIGSEGV. + */ + if (mm_has_pgste(vma->vm_mm)) + return 0; +#endif + *vm_flags &= ~VM_NOHUGEPAGE; + *vm_flags |= VM_HUGEPAGE; + /* + * If the vma becomes good for khugepaged to scan, + * register it here without waiting for a page fault that + * may not happen any time soon. + */ + khugepaged_enter_vma(vma, *vm_flags); + break; + case MADV_NOHUGEPAGE: + *vm_flags &= ~VM_HUGEPAGE; + *vm_flags |= VM_NOHUGEPAGE; + /* + * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning + * this vma even if we leave the mm registered in khugepaged if + * it got registered before VM_NOHUGEPAGE was set. 
+ */ + break; + } + + return 0; +} + +int __init khugepaged_init(void) +{ + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + sizeof(struct khugepaged_mm_slot), + __alignof__(struct khugepaged_mm_slot), + 0, NULL); + if (!mm_slot_cache) + return -ENOMEM; + + khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; + khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; + khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; + khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; + + return 0; +} + +void __init khugepaged_destroy(void) +{ + kmem_cache_destroy(mm_slot_cache); +} + +static inline int hpage_collapse_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; /* non-zero once mm_users has dropped to zero */ +} + +void __khugepaged_enter(struct mm_struct *mm) +{ + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; + int wakeup; + + mm_slot = mm_slot_alloc(mm_slot_cache); + if (!mm_slot) + return; + + slot = &mm_slot->slot; + + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { + mm_slot_free(mm_slot_cache, mm_slot); + return; + } + + spin_lock(&khugepaged_mm_lock); + mm_slot_insert(mm_slots_hash, mm, slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little. 
+ */ + wakeup = list_empty(&khugepaged_scan.mm_head); + list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); + spin_unlock(&khugepaged_mm_lock); + + mmgrab(mm); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); +} + +void khugepaged_enter_vma(struct vm_area_struct *vma, + unsigned long vm_flags) +{ + if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && + hugepage_flags_enabled()) { + if (hugepage_vma_check(vma, vm_flags, false, false, true)) + __khugepaged_enter(vma->vm_mm); + } +} + +void __khugepaged_exit(struct mm_struct *mm) +{ + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; + int free = 0; + + spin_lock(&khugepaged_mm_lock); + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + hash_del(&slot->hash); + list_del(&slot->mm_node); + free = 1; + } + spin_unlock(&khugepaged_mm_lock); + + if (free) { + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + mm_slot_free(mm_slot_cache, mm_slot); + mmdrop(mm); + } else if (mm_slot) { + /* + * This is required to serialize against + * hpage_collapse_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we return all + * pagetables will be destroyed) until khugepaged has finished + * working on the pagetables under the mmap_lock. 
+ */ + mmap_write_lock(mm); + mmap_write_unlock(mm); + } +} + +static void release_pte_page(struct page *page) +{ + mod_node_page_state(page_pgdat(page), + NR_ISOLATED_ANON + page_is_file_lru(page), + -compound_nr(page)); /* undo the isolated-page accounting */ + unlock_page(page); + putback_lru_page(page); +} + +static void release_pte_pages(pte_t *pte, pte_t *_pte, + struct list_head *compound_pagelist) +{ + struct page *page, *tmp; + + while (--_pte >= pte) { + pte_t pteval = *_pte; + + page = pte_page(pteval); + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) && + !PageCompound(page)) + release_pte_page(page); + } + + list_for_each_entry_safe(page, tmp, compound_pagelist, lru) { + list_del(&page->lru); + release_pte_page(page); + } +} + +static bool is_refcount_suitable(struct page *page) +{ + int expected_refcount; + + expected_refcount = total_mapcount(page); + if (PageSwapCache(page)) + expected_refcount += compound_nr(page); + + return page_count(page) == expected_refcount; /* any other count means an extra reference is held */ +} + +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte, + struct collapse_control *cc, + struct list_head *compound_pagelist) +{ + struct page *page = NULL; + pte_t *_pte; + int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; + bool writable = false; + + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval) || (pte_present(pteval) && + is_zero_pfn(pte_pfn(pteval)))) { + ++none_or_zero; + if (!userfaultfd_armed(vma) && + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { + continue; + } else { + result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); + goto out; + } + } + if (!pte_present(pteval)) { + result = SCAN_PTE_NON_PRESENT; + goto out; + } + if (pte_uffd_wp(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out; + } + page = vm_normal_page(vma, address, pteval); + if (unlikely(!page) || unlikely(is_zone_device_page(page))) { + result = 
SCAN_PAGE_NULL; + goto out; + } + + VM_BUG_ON_PAGE(!PageAnon(page), page); + + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out; + } + } + + if (PageCompound(page)) { + struct page *p; + page = compound_head(page); + + /* + * Check if we have dealt with the compound page + * already + */ + list_for_each_entry(p, compound_pagelist, lru) { + if (page == p) + goto next; + } + } + + /* + * We can do it before isolate_lru_page because the + * page can't be freed from under us. NOTE: PG_lock + * is needed to serialize against split_huge_page + * when invoked from the VM. + */ + if (!trylock_page(page)) { + result = SCAN_PAGE_LOCK; + goto out; + } + + /* + * Check if the page has any GUP (or other external) pins. + * + * The page table that maps the page has been already unlinked + * from the page table tree and this process cannot get + * an additional pin on the page. + * + * New pins can come later if the page is shared across fork, + * but not from this process. The other process cannot write to + * the page, only trigger CoW. + */ + if (!is_refcount_suitable(page)) { + unlock_page(page); + result = SCAN_PAGE_COUNT; + goto out; + } + + /* + * Isolate the page to avoid collapsing a hugepage + * currently in use by the VM. 
+ */ + if (isolate_lru_page(page)) { + unlock_page(page); + result = SCAN_DEL_PAGE_LRU; + goto out; + } + mod_node_page_state(page_pgdat(page), + NR_ISOLATED_ANON + page_is_file_lru(page), + compound_nr(page)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageLRU(page), page); + + if (PageCompound(page)) + list_add_tail(&page->lru, compound_pagelist); +next: + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) + referenced++; + + if (pte_write(pteval)) + writable = true; + } + + if (unlikely(!writable)) { + result = SCAN_PAGE_RO; + } else if (unlikely(cc->is_khugepaged && !referenced)) { + result = SCAN_LACK_REFERENCED_PAGE; + } else { + result = SCAN_SUCCEED; + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); + return result; + } +out: + release_pte_pages(pte, _pte, compound_pagelist); /* failure: release everything isolated so far */ + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); + return result; +} + +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl, + struct list_head *compound_pagelist) +{ + struct page *src_page, *tmp; + pte_t *_pte; + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; + _pte++, page++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + if (is_zero_pfn(pte_pfn(pteval))) { + /* + * ptl mostly unnecessary. 
+ */ + spin_lock(ptl); + ptep_clear(vma->vm_mm, address, _pte); + spin_unlock(ptl); + } + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); + if (!PageCompound(src_page)) + release_pte_page(src_page); + /* + * ptl mostly unnecessary, but preempt has to + * be disabled to update the per-cpu stats + * inside page_remove_rmap(). + */ + spin_lock(ptl); + ptep_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page, vma, false); + spin_unlock(ptl); + free_page_and_swap_cache(src_page); + } + } + + list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { + list_del(&src_page->lru); + mod_node_page_state(page_pgdat(src_page), + NR_ISOLATED_ANON + page_is_file_lru(src_page), + -compound_nr(src_page)); + unlock_page(src_page); + free_swap_cache(src_page); + putback_lru_page(src_page); + } +} + +static void khugepaged_alloc_sleep(void) +{ + DEFINE_WAIT(wait); + + add_wait_queue(&khugepaged_wait, &wait); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); +} + +struct collapse_control khugepaged_collapse_control = { + .is_khugepaged = true, +}; + +static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) +{ + int i; + + /* + * If node_reclaim_mode is disabled, then no extra effort is made to + * allocate memory locally. 
+ */ + if (!node_reclaim_enabled()) + return false; + + /* If there is a count for this node already, it must be acceptable */ + if (cc->node_load[nid]) + return false; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (!cc->node_load[i]) + continue; + if (node_distance(nid, i) > node_reclaim_distance) + return true; + } + return false; +} + +#define khugepaged_defrag() \ + (transparent_hugepage_flags & \ + (1<node_load[nid] > max_value) { + max_value = cc->node_load[nid]; + target_node = nid; + } + + for_each_online_node(nid) { + if (max_value == cc->node_load[nid]) + node_set(nid, cc->alloc_nmask); + } + + return target_node; +} +#else +static int hpage_collapse_find_target_node(struct collapse_control *cc) +{ + return 0; +} +#endif + +static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node, + nodemask_t *nmask) +{ + *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask); + if (unlikely(!*hpage)) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + return false; + } + + prep_transhuge_page(*hpage); + count_vm_event(THP_COLLAPSE_ALLOC); + return true; +} + +/* + * If mmap_lock temporarily dropped, revalidate vma + * before taking mmap_lock. + * Returns enum scan_result value. + */ + +static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, + bool expect_anon, + struct vm_area_struct **vmap, + struct collapse_control *cc) +{ + struct vm_area_struct *vma; + + if (unlikely(hpage_collapse_test_exit(mm))) + return SCAN_ANY_PROCESS; + + *vmap = vma = find_vma(mm, address); + if (!vma) + return SCAN_VMA_NULL; + + if (!transhuge_vma_suitable(vma, address)) + return SCAN_ADDRESS_RANGE; + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, + cc->is_khugepaged)) + return SCAN_VMA_CHECK; + /* + * Anon VMA expected, the address may be unmapped then + * remapped to file after khugepaged reacquired the mmap_lock. + * + * hugepage_vma_check may return true for qualified file + * vmas. 
+ */ + if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) + return SCAN_PAGE_ANON; + return SCAN_SUCCEED; +} + +/* + * See pmd_trans_unstable() for how the result may change out from + * underneath us, even if we hold mmap_lock in read. + */ +static int find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, + pmd_t **pmd) +{ + pmd_t pmde; + + *pmd = mm_find_pmd(mm, address); + if (!*pmd) + return SCAN_PMD_NULL; + + pmde = pmd_read_atomic(*pmd); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ + barrier(); +#endif + if (pmd_none(pmde)) + return SCAN_PMD_NONE; + if (!pmd_present(pmde)) + return SCAN_PMD_NULL; + if (pmd_trans_huge(pmde)) + return SCAN_PMD_MAPPED; + if (pmd_devmap(pmde)) + return SCAN_PMD_NULL; + if (pmd_bad(pmde)) + return SCAN_PMD_NULL; + return SCAN_SUCCEED; +} + +static int check_pmd_still_valid(struct mm_struct *mm, + unsigned long address, + pmd_t *pmd) +{ + pmd_t *new_pmd; + int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); + + if (result != SCAN_SUCCEED) + return result; + if (new_pmd != pmd) + return SCAN_FAIL; /* the address now resolves to a different pmd */ + return SCAN_SUCCEED; +} + +/* + * Bring missing pages in from swap, to complete THP collapse. + * Only done if hpage_collapse_scan_pmd believes it is worthwhile. + * + * Called and returns without pte mapped or spinlocks held. + * Returns enum scan_result; unless it is SCAN_SUCCEED, mmap_lock has been released. 
+ */ + +static int __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd, + int referenced) +{ + int swapped_in = 0; + vm_fault_t ret = 0; + unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); + + for (address = haddr; address < end; address += PAGE_SIZE) { + struct vm_fault vmf = { + .vma = vma, + .address = address, + .pgoff = linear_page_index(vma, address), /* index of the page being faulted, not of haddr */ + .flags = FAULT_FLAG_ALLOW_RETRY, + .pmd = pmd, + }; + + vmf.pte = pte_offset_map(pmd, address); + vmf.orig_pte = *vmf.pte; + if (!is_swap_pte(vmf.orig_pte)) { + pte_unmap(vmf.pte); + continue; + } + ret = do_swap_page(&vmf); + + /* + * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. + * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because + * we do not retry here and swap entry will remain in pagetable + * resulting in later failure. + */ + if (ret & VM_FAULT_RETRY) { + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); + /* Likely, but not guaranteed, that page lock failed */ + return SCAN_PAGE_LOCK; + } + if (ret & VM_FAULT_ERROR) { + mmap_read_unlock(mm); + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); + return SCAN_FAIL; + } + swapped_in++; + } + + /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ + if (swapped_in) + lru_add_drain(); + + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); + return SCAN_SUCCEED; +} + /* Allocate a huge page on the node picked by hpage_collapse_find_target_node() and charge it to @mm; returns enum scan_result. */ +static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, + struct collapse_control *cc) +{ + gfp_t gfp = (cc->is_khugepaged ? 
alloc_hugepage_khugepaged_gfpmask() : + GFP_TRANSHUGE); + int node = hpage_collapse_find_target_node(cc); + + if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask)) + return SCAN_ALLOC_HUGE_PAGE_FAIL; + if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp))) + return SCAN_CGROUP_CHARGE_FAIL; /* *hpage stays set; collapse_huge_page() releases it */ + count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC); + return SCAN_SUCCEED; +} + +static int collapse_huge_page(struct mm_struct *mm, unsigned long address, + int referenced, int unmapped, + struct collapse_control *cc) +{ + LIST_HEAD(compound_pagelist); + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *hpage; + spinlock_t *pmd_ptl, *pte_ptl; + int result = SCAN_FAIL; + struct vm_area_struct *vma; + struct mmu_notifier_range range; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + /* + * Before allocating the hugepage, release the mmap_lock read lock. + * The allocation can take potentially a long time if it involves + * sync compaction, and we do not need to hold the mmap_lock during + * that. We will recheck the vma after taking it again in write mode. + */ + mmap_read_unlock(mm); + + result = alloc_charge_hpage(&hpage, mm, cc); + if (result != SCAN_SUCCEED) + goto out_nolock; + + mmap_read_lock(mm); + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); + if (result != SCAN_SUCCEED) { + mmap_read_unlock(mm); + goto out_nolock; + } + + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) { + mmap_read_unlock(mm); + goto out_nolock; + } + + if (unmapped) { + /* + * __collapse_huge_page_swapin will return with mmap_lock + * released when it fails. So we jump out_nolock directly in + * that case. Continuing to collapse causes inconsistency. 
+ */ + result = __collapse_huge_page_swapin(mm, vma, address, pmd, + referenced); + if (result != SCAN_SUCCEED) + goto out_nolock; + } + + mmap_read_unlock(mm); + /* + * Prevent all access to pagetables with the exception of + * gup_fast later handled by the ptep_clear_flush and the VM + * handled by the anon_vma lock + PG_lock. + */ + mmap_write_lock(mm); + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); + if (result != SCAN_SUCCEED) + goto out_up_write; + /* check if the pmd is still valid */ + result = check_pmd_still_valid(mm, address, pmd); + if (result != SCAN_SUCCEED) + goto out_up_write; + + anon_vma_lock_write(vma->anon_vma); + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, + address, address + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + + pte = pte_offset_map(pmd, address); + pte_ptl = pte_lockptr(mm, pmd); + + pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ + /* + * This removes any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address to + * avoid the risk of CPU bugs in that area. + * + * Parallel fast GUP is fine since fast GUP will back off when + * it detects PMD is changed. + */ + _pmd = pmdp_collapse_flush(vma, address, pmd); + spin_unlock(pmd_ptl); + mmu_notifier_invalidate_range_end(&range); + tlb_remove_table_sync_one(); + + spin_lock(pte_ptl); + result = __collapse_huge_page_isolate(vma, address, pte, cc, + &compound_pagelist); + spin_unlock(pte_ptl); + + if (unlikely(result != SCAN_SUCCEED)) { + pte_unmap(pte); + spin_lock(pmd_ptl); + BUG_ON(!pmd_none(*pmd)); + /* + * We can only use set_pmd_at when establishing + * hugepmds and never for establishing regular pmds that + * points to regular pagetables. 
Use pmd_populate for that + */ + pmd_populate(mm, pmd, pmd_pgtable(_pmd)); + spin_unlock(pmd_ptl); + anon_vma_unlock_write(vma->anon_vma); + goto out_up_write; + } + + /* + * All pages are isolated and locked so anon_vma rmap + * can't run anymore. + */ + anon_vma_unlock_write(vma->anon_vma); + + __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, + &compound_pagelist); + pte_unmap(pte); + /* + * spin_lock() below is not the equivalent of smp_wmb(), but + * the smp_wmb() inside __SetPageUptodate() can be reused to + * avoid the copy_huge_page writes to become visible after + * the set_pmd_at() write. + */ + __SetPageUptodate(hpage); + pgtable = pmd_pgtable(_pmd); + + _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + + spin_lock(pmd_ptl); + BUG_ON(!pmd_none(*pmd)); + page_add_new_anon_rmap(hpage, vma, address); + lru_cache_add_inactive_or_unevictable(hpage, vma); + pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache_pmd(vma, address, pmd); + spin_unlock(pmd_ptl); + + hpage = NULL; /* the page is installed; skip the uncharge/put below */ + + result = SCAN_SUCCEED; +out_up_write: + mmap_write_unlock(mm); +out_nolock: + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); + } + trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); + return result; +} + +static int hpage_collapse_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, bool *mmap_locked, + struct collapse_control *cc) +{ + pmd_t *pmd; + pte_t *pte, *_pte; + int result = SCAN_FAIL, referenced = 0; + int none_or_zero = 0, shared = 0; + struct page *page = NULL; + unsigned long _address; + spinlock_t *ptl; + int node = NUMA_NO_NODE, unmapped = 0; + bool writable = false; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) + goto out; + + memset(cc->node_load, 0, sizeof(cc->node_load)); + nodes_clear(cc->alloc_nmask); + 
pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (is_swap_pte(pteval)) { + ++unmapped; + if (!cc->is_khugepaged || + unmapped <= khugepaged_max_ptes_swap) { + /* + * Always be strict with uffd-wp + * enabled swap entries. Please see + * comment below for pte_uffd_wp(). + */ + if (pte_swp_uffd_wp(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out_unmap; + } + continue; + } else { + result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); + goto out_unmap; + } + } + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + ++none_or_zero; + if (!userfaultfd_armed(vma) && + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { + continue; + } else { + result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); + goto out_unmap; + } + } + if (pte_uffd_wp(pteval)) { + /* + * Don't collapse the page if any of the small + * PTEs are armed with uffd write protection. + * Here we can also mark the new huge pmd as + * write protected if any of the small ones is + * marked but that could bring unknown + * userfault messages that falls outside of + * the registered range. So, just be simple. + */ + result = SCAN_PTE_UFFD_WP; + goto out_unmap; + } + if (pte_write(pteval)) + writable = true; + + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page) || unlikely(is_zone_device_page(page))) { + result = SCAN_PAGE_NULL; + goto out_unmap; + } + + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out_unmap; + } + } + + page = compound_head(page); + + /* + * Record which node the original page is from and save this + * information to cc->node_load[]. + * Khugepaged will allocate hugepage from the node has the max + * hit record. 
+ */ + node = page_to_nid(page); + if (hpage_collapse_scan_abort(node, cc)) { + result = SCAN_SCAN_ABORT; + goto out_unmap; + } + cc->node_load[node]++; + if (!PageLRU(page)) { + result = SCAN_PAGE_LRU; + goto out_unmap; + } + if (PageLocked(page)) { + result = SCAN_PAGE_LOCK; + goto out_unmap; + } + if (!PageAnon(page)) { + result = SCAN_PAGE_ANON; + goto out_unmap; + } + + /* + * Check if the page has any GUP (or other external) pins. + * + * Here the check is racy it may see total_mapcount > refcount + * in some cases. + * For example, one process with one forked child process. + * The parent has the PMD split due to MADV_DONTNEED, then + * the child is trying unmap the whole PMD, but khugepaged + * may be scanning the parent between the child has + * PageDoubleMap flag cleared and dec the mapcount. So + * khugepaged may see total_mapcount > refcount. + * + * But such case is ephemeral we could always retry collapse + * later. However it may report false positive if the page + * has excessive GUP pins (i.e. 512). Anyway the same check + * will be done again later the risk seems low. 
+ */ + if (!is_refcount_suitable(page)) { + result = SCAN_PAGE_COUNT; + goto out_unmap; + } + + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page (_address is the pte being scanned) + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + _address))) + referenced++; + } + if (!writable) { + result = SCAN_PAGE_RO; + } else if (cc->is_khugepaged && + (!referenced || + (unmapped && referenced < HPAGE_PMD_NR / 2))) { + result = SCAN_LACK_REFERENCED_PAGE; + } else { + result = SCAN_SUCCEED; + } +out_unmap: + pte_unmap_unlock(pte, ptl); + if (result == SCAN_SUCCEED) { + result = collapse_huge_page(mm, address, referenced, + unmapped, cc); + /* collapse_huge_page will return with the mmap_lock released */ + *mmap_locked = false; + } +out: + trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, + none_or_zero, result, unmapped); + return result; +} + +static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) +{ + struct mm_slot *slot = &mm_slot->slot; + struct mm_struct *mm = slot->mm; + + lockdep_assert_held(&khugepaged_mm_lock); + + if (hpage_collapse_test_exit(mm)) { + /* free mm_slot */ + hash_del(&slot->hash); + list_del(&slot->mm_node); + + /* + * Not strictly needed because the mm exited already. + * + * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + */ + + /* khugepaged_mm_lock actually not necessary for the below */ + mm_slot_free(mm_slot_cache, mm_slot); + mmdrop(mm); + } +} + +#ifdef CONFIG_SHMEM +/* + * Notify khugepaged that given addr of the mm is pte-mapped THP. Then + * khugepaged should try to collapse the page table. + * + * Note that following race exists: + * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A, + * emptying the A's ->pte_mapped_thp[] array. 
+ * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and + * retract_page_tables() finds a VMA in mm_struct A mapping the same extent + * (at virtual address X) and adds an entry (for X) into mm_struct A's + * ->pte_mapped_thp[] array. + * (3) khugepaged calls hpage_collapse_scan_file() for mm_struct A at X, + * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry + * (for X) into mm_struct A's ->pte_mapped_thp[] array. + * Thus, it's possible the same address is added multiple times for the same + * mm_struct. Should this happen, we'll simply attempt + * collapse_pte_mapped_thp() multiple times for the same address, under the same + * exclusive mmap_lock, and assuming the first call is successful, subsequent + * attempts will return quickly (without grabbing any additional locks) when + * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap + * check, and since this is a rare occurrence, the cost of preventing this + * "multiple-add" is thought to be more expensive than just handling it, should + * it occur. 
+ */
+/*
+ * Append @addr to the pte_mapped_thp[] array of the khugepaged slot found
+ * for @mm.  @addr must be HPAGE_PMD aligned.  Lookup and append both run
+ * under khugepaged_mm_lock.
+ *
+ * Returns true when the address was stored, false when no slot was found
+ * for @mm or the slot already holds MAX_PTE_MAPPED_THP entries.
+ */
+static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+					  unsigned long addr)
+{
+	struct khugepaged_mm_slot *khp_slot;
+	struct mm_slot *slot;
+	bool queued;
+
+	VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+
+	spin_lock(&khugepaged_mm_lock);
+	slot = mm_slot_lookup(mm_slots_hash, mm);
+	khp_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
+	/* Only queue when a slot exists and its array still has room. */
+	queued = khp_slot && khp_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP;
+	if (likely(queued)) {
+		khp_slot->pte_mapped_thp[khp_slot->nr_pte_mapped_thp] = addr;
+		khp_slot->nr_pte_mapped_thp++;
+	}
+	spin_unlock(&khugepaged_mm_lock);
+
+	return queued;
+}
+
+/*
+ * Hand @hpage to do_set_pmd() for the PMD slot @pmdp at @addr in @vma.
+ * hpage must be locked, and mmap_lock must be held in write.
+ *
+ * Returns SCAN_SUCCEED, after taking one extra reference on @hpage, when
+ * do_set_pmd() returns zero; returns SCAN_FAIL without touching the
+ * refcount otherwise.
+ */
+static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
+			pmd_t *pmdp, struct page *hpage)
+{
+	struct vm_fault fault = {
+		.vma = vma,
+		.address = addr,
+		.flags = 0,
+		.pmd = pmdp,
+	};
+
+	VM_BUG_ON(!PageTransHuge(hpage));
+	mmap_assert_write_locked(vma->vm_mm);
+
+	if (!do_set_pmd(&fault, hpage)) {
+		/* NOTE(review): presumably the reference owned by the new mapping - confirm. */
+		get_page(hpage);
+		return SCAN_SUCCEED;
+	}
+
+	return SCAN_FAIL;
+}
+
+/*
+ * A note about locking:
+ * Trying to take the page table spinlocks would be useless here because those
+ * are only used to synchronize:
+ *
+ * - modifying terminal entries (ones that point to a data page, not to another
+ *   page table)
+ * - installing *new* non-terminal entries
+ *
+ * Instead, we need roughly the same kind of protection as free_pgtables() or
+ * mm_take_all_locks() (but only for a single VMA):
+ * The mmap lock together with this VMA's rmap locks covers all paths towards
+ * the page table entries we're messing with here, except for hardware page
+ * table walks and lockless_pages_from_mm().
+ */
+/*
+ * Retract the page table referenced by @pmdp for the range
+ * [@addr, @addr + HPAGE_PMD_SIZE) of @vma in @mm.
+ *
+ * The caller must hold mmap_lock for write; if @vma is file-backed its
+ * mapping's i_mmap_rwsem must be write-held, and if it has an anon_vma the
+ * anon_vma root rwsem must be write-held (all three are asserted below).
+ *
+ * pmdp_collapse_flush() runs inside an MMU_NOTIFY_CLEAR notifier
+ * start/end pair and returns the old PMD value; the page table that value
+ * referenced is then dropped from the mm's PTE-page accounting and freed.
+ */
+static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+				  unsigned long addr, pmd_t *pmdp)
+{
+	const unsigned long end = addr + HPAGE_PMD_SIZE;
+	struct mmu_notifier_range mmu_range;
+	pmd_t old_pmd;
+
+	mmap_assert_write_locked(mm);
+	if (vma->vm_file) {
+		struct address_space *mapping = vma->vm_file->f_mapping;
+
+		lockdep_assert_held_write(&mapping->i_mmap_rwsem);
+	}
+	/*
+	 * Every anon_vma attached to the VMA shares one root, so checking
+	 * the root's lock covers all of them.
+	 */
+	if (vma->anon_vma)
+		lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
+
+	mmu_notifier_range_init(&mmu_range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+				addr, end);
+	mmu_notifier_invalidate_range_start(&mmu_range);
+	old_pmd = pmdp_collapse_flush(vma, addr, pmdp);
+	/* NOTE(review): presumably waits out lockless page-table walkers - confirm. */
+	tlb_remove_table_sync_one();
+	mmu_notifier_invalidate_range_end(&mmu_range);
+
+	/* Account for and release the page table the old PMD pointed at. */
+	mm_dec_nr_ptes(mm);
+	page_table_check_pte_clear_range(mm, addr, old_pmd);
+	pte_free(mm, pmd_pgtable(old_pmd));
+}
+
+/**
+ * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
+ * address haddr.
+ *
+ * @mm: process address space where collapse happens
+ * @addr: THP collapse address
+ * @install_pmd: If a huge PMD should be installed
+ *
+ * This function checks whether all the PTEs in the PMD are pointing to the
+ * right THP. If so, retract the page table so the THP can refault in
+ * as pmd-mapped. Possibly install a huge PMD mapping the THP.
+ */ +int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd) +{ + unsigned long haddr = addr & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vma_lookup(mm, haddr); + struct page *hpage; + pte_t *start_pte, *pte; + pmd_t *pmd; + spinlock_t *ptl; + int count = 0, result = SCAN_FAIL; + int i; + + mmap_assert_write_locked(mm); + + /* Fast check before locking page if already PMD-mapped */ + result = find_pmd_or_thp_or_none(mm, haddr, &pmd); + if (result == SCAN_PMD_MAPPED) + return result; + + if (!vma || !vma->vm_file || + !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) + return SCAN_VMA_CHECK; + + /* + * If we are here, we've succeeded in replacing all the native pages + * in the page cache with a single hugepage. If a mm were to fault-in + * this memory (mapped by a suitably aligned VMA), we'd get the hugepage + * and map it by a PMD, regardless of sysfs THP settings. As such, let's + * analogously elide sysfs THP settings here. + */ + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + return SCAN_VMA_CHECK; + + /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ + if (userfaultfd_wp(vma)) + return SCAN_PTE_UFFD_WP; + + hpage = find_lock_page(vma->vm_file->f_mapping, + linear_page_index(vma, haddr)); + if (!hpage) + return SCAN_PAGE_NULL; + + if (!PageHead(hpage)) { + result = SCAN_FAIL; + goto drop_hpage; + } + + if (compound_order(hpage) != HPAGE_PMD_ORDER) { + result = SCAN_PAGE_COMPOUND; + goto drop_hpage; + } + + switch (result) { + case SCAN_SUCCEED: + break; + case SCAN_PMD_NONE: + /* + * In MADV_COLLAPSE path, possible race with khugepaged where + * all pte entries have been removed and pmd cleared. If so, + * skip all the pte checks and just update the pmd mapping. 
+ */ + goto maybe_install_pmd; + default: + goto drop_hpage; + } + + /* + * We need to lock the mapping so that from here on, only GUP-fast and + * hardware page walks can access the parts of the page tables that + * we're operating on. + * See collapse_and_free_pmd(). + */ + i_mmap_lock_write(vma->vm_file->f_mapping); + + /* + * This spinlock should be unnecessary: Nobody else should be accessing + * the page tables under spinlock protection here, only + * lockless_pages_from_mm() and the hardware page walker can access page + * tables while all the high-level locks are held in write mode. + */ + start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + result = SCAN_FAIL; + + /* step 1: check all mapped PTEs are to the right huge page */ + for (i = 0, addr = haddr, pte = start_pte; + i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { + struct page *page; + + /* empty pte, skip */ + if (pte_none(*pte)) + continue; + + /* page swapped out, abort */ + if (!pte_present(*pte)) { + result = SCAN_PTE_NON_PRESENT; + goto abort; + } + + page = vm_normal_page(vma, addr, *pte); + if (WARN_ON_ONCE(page && is_zone_device_page(page))) + page = NULL; + /* + * Note that uprobe, debugger, or MAP_PRIVATE may change the + * page table, but the new page will not be a subpage of hpage. + */ + if (hpage + i != page) + goto abort; + count++; + } + + /* step 2: adjust rmap */ + for (i = 0, addr = haddr, pte = start_pte; + i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { + struct page *page; + + if (pte_none(*pte)) + continue; + page = vm_normal_page(vma, addr, *pte); + if (WARN_ON_ONCE(page && is_zone_device_page(page))) + goto abort; + page_remove_rmap(page, vma, false); + } + + pte_unmap_unlock(start_pte, ptl); + + /* step 3: set proper refcount and mm_counters. 
*/ + if (count) { + page_ref_sub(hpage, count); + add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); + } + + /* step 4: remove pte entries */ + /* we make no change to anon, but protect concurrent anon page lookup */ + if (vma->anon_vma) + anon_vma_lock_write(vma->anon_vma); + + collapse_and_free_pmd(mm, vma, haddr, pmd); + + if (vma->anon_vma) + anon_vma_unlock_write(vma->anon_vma); + i_mmap_unlock_write(vma->vm_file->f_mapping); + +maybe_install_pmd: + /* step 5: install pmd entry */ + result = install_pmd + ? set_huge_pmd(vma, haddr, pmd, hpage) + : SCAN_SUCCEED; + +drop_hpage: + unlock_page(hpage); + put_page(hpage); + return result; + +abort: + pte_unmap_unlock(start_pte, ptl); + i_mmap_unlock_write(vma->vm_file->f_mapping); + goto drop_hpage; +} + +static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) +{ + struct mm_slot *slot = &mm_slot->slot; + struct mm_struct *mm = slot->mm; + int i; + + if (likely(mm_slot->nr_pte_mapped_thp == 0)) + return; + + if (!mmap_write_trylock(mm)) + return; + + if (unlikely(hpage_collapse_test_exit(mm))) + goto out; + + for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) + collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false); + +out: + mm_slot->nr_pte_mapped_thp = 0; + mmap_write_unlock(mm); +} + +static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, + struct mm_struct *target_mm, + unsigned long target_addr, struct page *hpage, + struct collapse_control *cc) +{ + struct vm_area_struct *vma; + int target_result = SCAN_FAIL; + + i_mmap_lock_write(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + int result = SCAN_FAIL; + struct mm_struct *mm = NULL; + unsigned long addr = 0; + pmd_t *pmd; + bool is_target = false; + + /* + * Check vma->anon_vma to exclude MAP_PRIVATE mappings that + * got written to. These VMAs are likely not worth investing + * mmap_write_lock(mm) as PMD-mapping is likely to be split + * later. 
+ * + * Note that vma->anon_vma check is racy: it can be set up after + * the check but before we took mmap_lock by the fault path. + * But page lock would prevent establishing any new ptes of the + * page, so we are safe. + * + * An alternative would be drop the check, but check that page + * table is clear before calling pmdp_collapse_flush() under + * ptl. It has higher chance to recover THP for the VMA, but + * has higher cost too. It would also probably require locking + * the anon_vma. + */ + if (READ_ONCE(vma->anon_vma)) { + result = SCAN_PAGE_ANON; + goto next; + } + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (addr & ~HPAGE_PMD_MASK || + vma->vm_end < addr + HPAGE_PMD_SIZE) { + result = SCAN_VMA_CHECK; + goto next; + } + mm = vma->vm_mm; + is_target = mm == target_mm && addr == target_addr; + result = find_pmd_or_thp_or_none(mm, addr, &pmd); + if (result != SCAN_SUCCEED) + goto next; + /* + * We need exclusive mmap_lock to retract page table. + * + * We use trylock due to lock inversion: we need to acquire + * mmap_lock while holding page lock. Fault path does it in + * reverse order. Trylock is a way to avoid deadlock. + * + * Also, it's not MADV_COLLAPSE's job to collapse other + * mappings - let khugepaged take care of them later. + */ + result = SCAN_PTE_MAPPED_HUGEPAGE; + if ((cc->is_khugepaged || is_target) && + mmap_write_trylock(mm)) { + /* + * Re-check whether we have an ->anon_vma, because + * collapse_and_free_pmd() requires that either no + * ->anon_vma exists or the anon_vma is locked. + * We already checked ->anon_vma above, but that check + * is racy because ->anon_vma can be populated under the + * mmap lock in read mode. + */ + if (vma->anon_vma) { + result = SCAN_PAGE_ANON; + goto unlock_next; + } + /* + * When a vma is registered with uffd-wp, we can't + * recycle the pmd pgtable because there can be pte + * markers installed. 
Skip it only, so the rest mm/vma + * can still have the same file mapped hugely, however + * it'll always mapped in small page size for uffd-wp + * registered ranges. + */ + if (hpage_collapse_test_exit(mm)) { + result = SCAN_ANY_PROCESS; + goto unlock_next; + } + if (userfaultfd_wp(vma)) { + result = SCAN_PTE_UFFD_WP; + goto unlock_next; + } + collapse_and_free_pmd(mm, vma, addr, pmd); + if (!cc->is_khugepaged && is_target) + result = set_huge_pmd(vma, addr, pmd, hpage); + else + result = SCAN_SUCCEED; + +unlock_next: + mmap_write_unlock(mm); + goto next; + } + /* + * Calling context will handle target mm/addr. Otherwise, let + * khugepaged try again later. + */ + if (!is_target) { + khugepaged_add_pte_mapped_thp(mm, addr); + continue; + } +next: + if (is_target) + target_result = result; + } + i_mmap_unlock_write(mapping); + return target_result; +} + +/** + * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. + * + * @mm: process address space where collapse happens + * @addr: virtual collapse start address + * @file: file that collapse on + * @start: collapse start address + * @cc: collapse context and scratchpad + * + * Basic scheme is simple, details are more complex: + * - allocate and lock a new huge page; + * - scan page cache replacing old pages with the new one + * + swap/gup in pages if necessary; + * + fill in gaps; + * + keep old pages around in case rollback is required; + * - if replacing succeeds: + * + copy data over; + * + free old pages; + * + unlock huge page; + * - if replacing failed; + * + put all pages back and unfreeze them; + * + restore gaps in the page cache; + * + unlock and free huge page; + */ +static int collapse_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) +{ + struct address_space *mapping = file->f_mapping; + struct page *hpage; + pgoff_t index, end = start + HPAGE_PMD_NR; + LIST_HEAD(pagelist); + XA_STATE_ORDER(xas, &mapping->i_pages, start, 
HPAGE_PMD_ORDER); + int nr_none = 0, result = SCAN_SUCCEED; + bool is_shmem = shmem_file(file); + int nr; + + VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); + VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); + + result = alloc_charge_hpage(&hpage, mm, cc); + if (result != SCAN_SUCCEED) + goto out; + + /* + * Ensure we have slots for all the pages in the range. This is + * almost certainly a no-op because most of the pages must be present + */ + do { + xas_lock_irq(&xas); + xas_create_range(&xas); + if (!xas_error(&xas)) + break; + xas_unlock_irq(&xas); + if (!xas_nomem(&xas, GFP_KERNEL)) { + result = SCAN_FAIL; + goto out; + } + } while (1); + + __SetPageLocked(hpage); + if (is_shmem) + __SetPageSwapBacked(hpage); + hpage->index = start; + hpage->mapping = mapping; + + /* + * At this point the hpage is locked and not up-to-date. + * It's safe to insert it into the page cache, because nobody would + * be able to map it or use it in another way until we unlock it. + */ + + xas_set(&xas, start); + for (index = start; index < end; index++) { + struct page *page = xas_next(&xas); + struct folio *folio; + + VM_BUG_ON(index != xas.xa_index); + if (is_shmem) { + if (!page) { + /* + * Stop if extent has been truncated or + * hole-punched, and is now completely + * empty. 
+ */ + if (index == start) { + if (!xas_next_entry(&xas, end - 1)) { + result = SCAN_TRUNCATED; + goto xa_locked; + } + xas_set(&xas, index); + } + if (!shmem_charge(mapping->host, 1)) { + result = SCAN_FAIL; + goto xa_locked; + } + xas_store(&xas, hpage); + nr_none++; + continue; + } + + if (xa_is_value(page) || !PageUptodate(page)) { + xas_unlock_irq(&xas); + /* swap in or instantiate fallocated page */ + if (shmem_get_folio(mapping->host, index, + &folio, SGP_NOALLOC)) { + result = SCAN_FAIL; + goto xa_unlocked; + } + page = folio_file_page(folio, index); + } else if (trylock_page(page)) { + get_page(page); + xas_unlock_irq(&xas); + } else { + result = SCAN_PAGE_LOCK; + goto xa_locked; + } + } else { /* !is_shmem */ + if (!page || xa_is_value(page)) { + xas_unlock_irq(&xas); + page_cache_sync_readahead(mapping, &file->f_ra, + file, index, + end - index); + /* drain pagevecs to help isolate_lru_page() */ + lru_add_drain(); + page = find_lock_page(mapping, index); + if (unlikely(page == NULL)) { + result = SCAN_FAIL; + goto xa_unlocked; + } + } else if (PageDirty(page)) { + /* + * khugepaged only works on read-only fd, + * so this page is dirty because it hasn't + * been flushed since first write. There + * won't be new dirty pages. + * + * Trigger async flush here and hope the + * writeback is done when khugepaged + * revisits this page. + * + * This is a one-off situation. We are not + * forcing writeback in loop. + */ + xas_unlock_irq(&xas); + filemap_flush(mapping); + result = SCAN_FAIL; + goto xa_unlocked; + } else if (PageWriteback(page)) { + xas_unlock_irq(&xas); + result = SCAN_FAIL; + goto xa_unlocked; + } else if (trylock_page(page)) { + get_page(page); + xas_unlock_irq(&xas); + } else { + result = SCAN_PAGE_LOCK; + goto xa_locked; + } + } + + /* + * The page must be locked, so we can drop the i_pages lock + * without racing with truncate. 
+ */ + VM_BUG_ON_PAGE(!PageLocked(page), page); + + /* make sure the page is up to date */ + if (unlikely(!PageUptodate(page))) { + result = SCAN_FAIL; + goto out_unlock; + } + + /* + * If file was truncated then extended, or hole-punched, before + * we locked the first page, then a THP might be there already. + * This will be discovered on the first iteration. + */ + if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + result = compound_order(head) == HPAGE_PMD_ORDER && + head->index == start + /* Maybe PMD-mapped */ + ? SCAN_PTE_MAPPED_HUGEPAGE + : SCAN_PAGE_COMPOUND; + goto out_unlock; + } + + folio = page_folio(page); + + if (folio_mapping(folio) != mapping) { + result = SCAN_TRUNCATED; + goto out_unlock; + } + + if (!is_shmem && (folio_test_dirty(folio) || + folio_test_writeback(folio))) { + /* + * khugepaged only works on read-only fd, so this + * page is dirty because it hasn't been flushed + * since first write. + */ + result = SCAN_FAIL; + goto out_unlock; + } + + if (folio_isolate_lru(folio)) { + result = SCAN_DEL_PAGE_LRU; + goto out_unlock; + } + + if (!filemap_release_folio(folio, GFP_KERNEL)) { + result = SCAN_PAGE_HAS_PRIVATE; + folio_putback_lru(folio); + goto out_unlock; + } + + if (folio_mapped(folio)) + try_to_unmap(folio, + TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); + + xas_lock_irq(&xas); + xas_set(&xas, index); + + VM_BUG_ON_PAGE(page != xas_load(&xas), page); + + /* + * The page is expected to have page_count() == 3: + * - we hold a pin on it; + * - one reference from page cache; + * - one from isolate_lru_page; + */ + if (!page_ref_freeze(page, 3)) { + result = SCAN_PAGE_COUNT; + xas_unlock_irq(&xas); + putback_lru_page(page); + goto out_unlock; + } + + /* + * Add the page to the list to be able to undo the collapse if + * something go wrong. + */ + list_add_tail(&page->lru, &pagelist); + + /* Finally, replace with the new page. 
*/ + xas_store(&xas, hpage); + continue; +out_unlock: + unlock_page(page); + put_page(page); + goto xa_unlocked; + } + nr = thp_nr_pages(hpage); + + if (is_shmem) + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); + else { + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); + filemap_nr_thps_inc(mapping); + /* + * Paired with smp_mb() in do_dentry_open() to ensure + * i_writecount is up to date and the update to nr_thps is + * visible. Ensures the page cache will be truncated if the + * file is opened writable. + */ + smp_mb(); + if (inode_is_open_for_write(mapping->host)) { + result = SCAN_FAIL; + __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr); + filemap_nr_thps_dec(mapping); + goto xa_locked; + } + } + + if (nr_none) { + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); + /* nr_none is always 0 for non-shmem. */ + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); + } + + /* Join all the small entries into a single multi-index entry */ + xas_set_order(&xas, start, HPAGE_PMD_ORDER); + xas_store(&xas, hpage); +xa_locked: + xas_unlock_irq(&xas); +xa_unlocked: + + /* + * If collapse is successful, flush must be done now before copying. + * If collapse is unsuccessful, does flush actually need to be done? + * Do it anyway, to clear the state. + */ + try_to_unmap_flush(); + + if (result == SCAN_SUCCEED) { + struct page *page, *tmp; + + /* + * Replacing old pages with new one has succeeded, now we + * need to copy the content and free the old pages. 
+ */ + index = start; + list_for_each_entry_safe(page, tmp, &pagelist, lru) { + while (index < page->index) { + clear_highpage(hpage + (index % HPAGE_PMD_NR)); + index++; + } + copy_highpage(hpage + (page->index % HPAGE_PMD_NR), + page); + list_del(&page->lru); + page->mapping = NULL; + page_ref_unfreeze(page, 1); + ClearPageActive(page); + ClearPageUnevictable(page); + unlock_page(page); + put_page(page); + index++; + } + while (index < end) { + clear_highpage(hpage + (index % HPAGE_PMD_NR)); + index++; + } + + SetPageUptodate(hpage); + page_ref_add(hpage, HPAGE_PMD_NR - 1); + if (is_shmem) + set_page_dirty(hpage); + lru_cache_add(hpage); + + /* + * Remove pte page tables, so we can re-fault the page as huge. + */ + result = retract_page_tables(mapping, start, mm, addr, hpage, + cc); + unlock_page(hpage); + hpage = NULL; + } else { + struct page *page; + + /* Something went wrong: roll back page cache changes */ + xas_lock_irq(&xas); + if (nr_none) { + mapping->nrpages -= nr_none; + shmem_uncharge(mapping->host, nr_none); + } + + xas_set(&xas, start); + xas_for_each(&xas, page, end - 1) { + page = list_first_entry_or_null(&pagelist, + struct page, lru); + if (!page || xas.xa_index < page->index) { + if (!nr_none) + break; + nr_none--; + /* Put holes back where they were */ + xas_store(&xas, NULL); + continue; + } + + VM_BUG_ON_PAGE(page->index != xas.xa_index, page); + + /* Unfreeze the page. 
*/ + list_del(&page->lru); + page_ref_unfreeze(page, 2); + xas_store(&xas, page); + xas_pause(&xas); + xas_unlock_irq(&xas); + unlock_page(page); + putback_lru_page(page); + xas_lock_irq(&xas); + } + VM_BUG_ON(nr_none); + xas_unlock_irq(&xas); + + hpage->mapping = NULL; + } + + if (hpage) + unlock_page(hpage); +out: + VM_BUG_ON(!list_empty(&pagelist)); + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); + } + /* TODO: tracepoints */ + return result; +} + +static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) +{ + struct page *page = NULL; + struct address_space *mapping = file->f_mapping; + XA_STATE(xas, &mapping->i_pages, start); + int present, swap; + int node = NUMA_NO_NODE; + int result = SCAN_SUCCEED; + + present = 0; + swap = 0; + memset(cc->node_load, 0, sizeof(cc->node_load)); + nodes_clear(cc->alloc_nmask); + rcu_read_lock(); + xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { + if (xas_retry(&xas, page)) + continue; + + if (xa_is_value(page)) { + ++swap; + if (cc->is_khugepaged && + swap > khugepaged_max_ptes_swap) { + result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); + break; + } + continue; + } + + /* + * TODO: khugepaged should compact smaller compound pages + * into a PMD sized page + */ + if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + result = compound_order(head) == HPAGE_PMD_ORDER && + head->index == start + /* Maybe PMD-mapped */ + ? SCAN_PTE_MAPPED_HUGEPAGE + : SCAN_PAGE_COMPOUND; + /* + * For SCAN_PTE_MAPPED_HUGEPAGE, further processing + * by the caller won't touch the page cache, and so + * it's safe to skip LRU and refcount checks before + * returning. 
+ */ + break; + } + + node = page_to_nid(page); + if (hpage_collapse_scan_abort(node, cc)) { + result = SCAN_SCAN_ABORT; + break; + } + cc->node_load[node]++; + + if (!PageLRU(page)) { + result = SCAN_PAGE_LRU; + break; + } + + if (page_count(page) != + 1 + page_mapcount(page) + page_has_private(page)) { + result = SCAN_PAGE_COUNT; + break; + } + + /* + * We probably should check if the page is referenced here, but + * nobody would transfer pte_young() to PageReferenced() for us. + * And rmap walk here is just too costly... + */ + + present++; + + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } + } + rcu_read_unlock(); + + if (result == SCAN_SUCCEED) { + if (cc->is_khugepaged && + present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { + result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); + } else { + result = collapse_file(mm, addr, file, start, cc); + } + } + + trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result); + return result; +} +#else +static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) +{ + BUILD_BUG(); +} + +static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) +{ +} + +static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ + return false; +} +#endif + +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, + struct collapse_control *cc) + __releases(&khugepaged_mm_lock) + __acquires(&khugepaged_mm_lock) +{ + struct vma_iterator vmi; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int progress = 0; + + VM_BUG_ON(!pages); + lockdep_assert_held(&khugepaged_mm_lock); + *result = SCAN_FAIL; + + if (khugepaged_scan.mm_slot) { + mm_slot = khugepaged_scan.mm_slot; + slot = &mm_slot->slot; + } else { + slot = list_entry(khugepaged_scan.mm_head.next, + struct 
mm_slot, mm_node); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } + spin_unlock(&khugepaged_mm_lock); + khugepaged_collapse_pte_mapped_thps(mm_slot); + + mm = slot->mm; + /* + * Don't wait for semaphore (to avoid long wait times). Just move to + * the next mm on the list. + */ + vma = NULL; + if (unlikely(!mmap_read_trylock(mm))) + goto breakouterloop_mmap_lock; + + progress++; + if (unlikely(hpage_collapse_test_exit(mm))) + goto breakouterloop; + + vma_iter_init(&vmi, mm, khugepaged_scan.address); + for_each_vma(vmi, vma) { + unsigned long hstart, hend; + + cond_resched(); + if (unlikely(hpage_collapse_test_exit(mm))) { + progress++; + break; + } + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) { +skip: + progress++; + continue; + } + hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); + hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); + if (khugepaged_scan.address > hend) + goto skip; + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + + while (khugepaged_scan.address < hend) { + bool mmap_locked = true; + + cond_resched(); + if (unlikely(hpage_collapse_test_exit(mm))) + goto breakouterloop; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + HPAGE_PMD_SIZE > + hend); + if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + struct file *file = get_file(vma->vm_file); + pgoff_t pgoff = linear_page_index(vma, + khugepaged_scan.address); + + mmap_read_unlock(mm); + *result = hpage_collapse_scan_file(mm, + khugepaged_scan.address, + file, pgoff, cc); + mmap_locked = false; + fput(file); + } else { + *result = hpage_collapse_scan_pmd(mm, vma, + khugepaged_scan.address, + &mmap_locked, + cc); + } + switch (*result) { + case SCAN_PTE_MAPPED_HUGEPAGE: { + pmd_t *pmd; + + *result = find_pmd_or_thp_or_none(mm, + khugepaged_scan.address, + &pmd); + if (*result != 
SCAN_SUCCEED) + break; + if (!khugepaged_add_pte_mapped_thp(mm, + khugepaged_scan.address)) + break; + } fallthrough; + case SCAN_SUCCEED: + ++khugepaged_pages_collapsed; + break; + default: + break; + } + + /* move to next address */ + khugepaged_scan.address += HPAGE_PMD_SIZE; + progress += HPAGE_PMD_NR; + if (!mmap_locked) + /* + * We released mmap_lock so break loop. Note + * that we drop mmap_lock before all hugepage + * allocations, so if allocation fails, we are + * guaranteed to break here and report the + * correct result back to caller. + */ + goto breakouterloop_mmap_lock; + if (progress >= pages) + goto breakouterloop; + } + } +breakouterloop: + mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_lock: + + spin_lock(&khugepaged_mm_lock); + VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); + /* + * Release the current mm_slot if this mm is about to die, or + * if we scanned all vmas of this mm. + */ + if (hpage_collapse_test_exit(mm) || !vma) { + /* + * Make sure that if mm_users is reaching zero while + * khugepaged runs here, khugepaged_exit will find + * mm_slot not pointing to the exiting mm. 
+ */ + if (slot->mm_node.next != &khugepaged_scan.mm_head) { + slot = list_entry(slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.mm_slot = + mm_slot_entry(slot, struct khugepaged_mm_slot, slot); + khugepaged_scan.address = 0; + } else { + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } + + collect_mm_slot(mm_slot); + } + + return progress; +} + +static int khugepaged_has_work(void) +{ + return !list_empty(&khugepaged_scan.mm_head) && + hugepage_flags_enabled(); +} + +static int khugepaged_wait_event(void) +{ + return !list_empty(&khugepaged_scan.mm_head) || + kthread_should_stop(); +} + +static void khugepaged_do_scan(struct collapse_control *cc) +{ + unsigned int progress = 0, pass_through_head = 0; + unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); + bool wait = true; + int result = SCAN_SUCCEED; + + lru_add_drain_all(); + + while (true) { + cond_resched(); + + if (unlikely(kthread_should_stop() || try_to_freeze())) + break; + + spin_lock(&khugepaged_mm_lock); + if (!khugepaged_scan.mm_slot) + pass_through_head++; + if (khugepaged_has_work() && + pass_through_head < 2) + progress += khugepaged_scan_mm_slot(pages - progress, + &result, cc); + else + progress = pages; + spin_unlock(&khugepaged_mm_lock); + + if (progress >= pages) + break; + + if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { + /* + * If fail to allocate the first time, try to sleep for + * a while. When hit again, cancel the scan. 
+ */ + if (!wait) + break; + wait = false; + khugepaged_alloc_sleep(); + } + } +} + +static bool khugepaged_should_wakeup(void) +{ + return kthread_should_stop() || + time_after_eq(jiffies, khugepaged_sleep_expire); +} + +static void khugepaged_wait_work(void) +{ + if (khugepaged_has_work()) { + const unsigned long scan_sleep_jiffies = + msecs_to_jiffies(khugepaged_scan_sleep_millisecs); + + if (!scan_sleep_jiffies) + return; + + khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; + wait_event_freezable_timeout(khugepaged_wait, + khugepaged_should_wakeup(), + scan_sleep_jiffies); + return; + } + + if (hugepage_flags_enabled()) + wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); +} + +static int khugepaged(void *none) +{ + struct khugepaged_mm_slot *mm_slot; + + set_freezable(); + set_user_nice(current, MAX_NICE); + + while (!kthread_should_stop()) { + khugepaged_do_scan(&khugepaged_collapse_control); + khugepaged_wait_work(); + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = khugepaged_scan.mm_slot; + khugepaged_scan.mm_slot = NULL; + if (mm_slot) + collect_mm_slot(mm_slot); + spin_unlock(&khugepaged_mm_lock); + return 0; +} + +static void set_recommended_min_free_kbytes(void) +{ + struct zone *zone; + int nr_zones = 0; + unsigned long recommended_min; + + if (!hugepage_flags_enabled()) { + calculate_min_free_kbytes(); + goto update_wmarks; + } + + for_each_populated_zone(zone) { + /* + * We don't need to worry about fragmentation of + * ZONE_MOVABLE since it only has movable pages. + */ + if (zone_idx(zone) > gfp_zone(GFP_USER)) + continue; + + nr_zones++; + } + + /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ + recommended_min = pageblock_nr_pages * nr_zones * 2; + + /* + * Make sure that on average at least two pageblocks are almost free + * of another type, one for a migratetype to fall back to and a + * second to avoid subsequent fallbacks of other types There are 3 + * MIGRATE_TYPES we care about. 
+ */ + recommended_min += pageblock_nr_pages * nr_zones * + MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; + + /* don't ever allow to reserve more than 5% of the lowmem */ + recommended_min = min(recommended_min, + (unsigned long) nr_free_buffer_pages() / 20); + recommended_min <<= (PAGE_SHIFT-10); + + if (recommended_min > min_free_kbytes) { + if (user_min_free_kbytes >= 0) + pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", + min_free_kbytes, recommended_min); + + min_free_kbytes = recommended_min; + } + +update_wmarks: + setup_per_zone_wmarks(); +} + +int start_stop_khugepaged(void) +{ + int err = 0; + + mutex_lock(&khugepaged_mutex); + if (hugepage_flags_enabled()) { + if (!khugepaged_thread) + khugepaged_thread = kthread_run(khugepaged, NULL, + "khugepaged"); + if (IS_ERR(khugepaged_thread)) { + pr_err("khugepaged: kthread_run(khugepaged) failed\n"); + err = PTR_ERR(khugepaged_thread); + khugepaged_thread = NULL; + goto fail; + } + + if (!list_empty(&khugepaged_scan.mm_head)) + wake_up_interruptible(&khugepaged_wait); + } else if (khugepaged_thread) { + kthread_stop(khugepaged_thread); + khugepaged_thread = NULL; + } + set_recommended_min_free_kbytes(); +fail: + mutex_unlock(&khugepaged_mutex); + return err; +} + +void khugepaged_min_free_kbytes_update(void) +{ + mutex_lock(&khugepaged_mutex); + if (hugepage_flags_enabled() && khugepaged_thread) + set_recommended_min_free_kbytes(); + mutex_unlock(&khugepaged_mutex); +} + +static int madvise_collapse_errno(enum scan_result r) +{ + /* + * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide + * actionable feedback to caller, so they may take an appropriate + * fallback measure depending on the nature of the failure. 
+ */ + switch (r) { + case SCAN_ALLOC_HUGE_PAGE_FAIL: + return -ENOMEM; + case SCAN_CGROUP_CHARGE_FAIL: + return -EBUSY; + /* Resource temporary unavailable - trying again might succeed */ + case SCAN_PAGE_COUNT: + case SCAN_PAGE_LOCK: + case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: + return -EAGAIN; + /* + * Other: Trying again likely not to succeed / error intrinsic to + * specified memory range. khugepaged likely won't be able to collapse + * either. + */ + default: + return -EINVAL; + } +} + +int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct collapse_control *cc; + struct mm_struct *mm = vma->vm_mm; + unsigned long hstart, hend, addr; + int thps = 0, last_fail = SCAN_FAIL; + bool mmap_locked = true; + + BUG_ON(vma->vm_start > start); + BUG_ON(vma->vm_end < end); + + *prev = vma; + + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + return -EINVAL; + + cc = kmalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + cc->is_khugepaged = false; + + mmgrab(mm); + lru_add_drain_all(); + + hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = end & HPAGE_PMD_MASK; + + for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { + int result = SCAN_FAIL; + + if (!mmap_locked) { + cond_resched(); + mmap_read_lock(mm); + mmap_locked = true; + result = hugepage_vma_revalidate(mm, addr, false, &vma, + cc); + if (result != SCAN_SUCCEED) { + last_fail = result; + goto out_nolock; + } + + hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); + } + mmap_assert_locked(mm); + memset(cc->node_load, 0, sizeof(cc->node_load)); + nodes_clear(cc->alloc_nmask); + if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + struct file *file = get_file(vma->vm_file); + pgoff_t pgoff = linear_page_index(vma, addr); + + mmap_read_unlock(mm); + mmap_locked = false; + result = hpage_collapse_scan_file(mm, addr, file, pgoff, + cc); + fput(file); + } else { + result = hpage_collapse_scan_pmd(mm, 
vma, addr, + &mmap_locked, cc); + } + if (!mmap_locked) + *prev = NULL; /* Tell caller we dropped mmap_lock */ + +handle_result: + switch (result) { + case SCAN_SUCCEED: + case SCAN_PMD_MAPPED: + ++thps; + break; + case SCAN_PTE_MAPPED_HUGEPAGE: + BUG_ON(mmap_locked); + BUG_ON(*prev); + mmap_write_lock(mm); + result = collapse_pte_mapped_thp(mm, addr, true); + mmap_write_unlock(mm); + goto handle_result; + /* Whitelisted set of results where continuing OK */ + case SCAN_PMD_NULL: + case SCAN_PTE_NON_PRESENT: + case SCAN_PTE_UFFD_WP: + case SCAN_PAGE_RO: + case SCAN_LACK_REFERENCED_PAGE: + case SCAN_PAGE_NULL: + case SCAN_PAGE_COUNT: + case SCAN_PAGE_LOCK: + case SCAN_PAGE_COMPOUND: + case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: + last_fail = result; + break; + default: + last_fail = result; + /* Other error, exit */ + goto out_maybelock; + } + } + +out_maybelock: + /* Caller expects us to hold mmap_lock on return */ + if (!mmap_locked) + mmap_read_lock(mm); +out_nolock: + mmap_assert_locked(mm); + mmdrop(mm); + kfree(cc); + + return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 + : madvise_collapse_errno(last_fail); +} diff --git a/mm/kmemleak.c b/mm/kmemleak.c new file mode 100644 index 000000000..646e29796 --- /dev/null +++ b/mm/kmemleak.c @@ -0,0 +1,2136 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm/kmemleak.c + * + * Copyright (C) 2008 ARM Limited + * Written by Catalin Marinas + * + * For more information on the algorithm and kmemleak usage, please see + * Documentation/dev-tools/kmemleak.rst. + * + * Notes on locking + * ---------------- + * + * The following locks and mutexes are used by kmemleak: + * + * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and + * accesses to the object_tree_root (or object_phys_tree_root). The + * object_list is the main list holding the metadata (struct kmemleak_object) + * for the allocated memory blocks. 
The object_tree_root and object_phys_tree_root + * are red black trees used to look-up metadata based on a pointer to the + * corresponding memory block. The object_phys_tree_root is for objects + * allocated with physical address. The kmemleak_object structures are + * added to the object_list and object_tree_root (or object_phys_tree_root) + * in the create_object() function called from the kmemleak_alloc() (or + * kmemleak_alloc_phys()) callback and removed in delete_object() called from + * the kmemleak_free() callback + * - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object. + * Accesses to the metadata (e.g. count) are protected by this lock. Note + * that some members of this structure may be protected by other means + * (atomic or kmemleak_lock). This lock is also held when scanning the + * corresponding memory block to avoid the kernel freeing it via the + * kmemleak_free() callback. This is less heavyweight than holding a global + * lock like kmemleak_lock during scanning. + * - scan_mutex (mutex): ensures that only one thread may scan the memory for + * unreferenced objects at a time. The gray_list contains the objects which + * are already referenced or marked as false positives and need to be + * scanned. This list is only modified during a scanning episode when the + * scan_mutex is held. At the end of a scan, the gray_list is always empty. + * Note that the kmemleak_object.use_count is incremented when an object is + * added to the gray_list and therefore cannot be freed. This mutex also + * prevents multiple users of the "kmemleak" debugfs file together with + * modifications to the memory scanning parameters including the scan_thread + * pointer + * + * Locks and mutexes are acquired/nested in the following order: + * + * scan_mutex [-> object->lock] -> kmemleak_lock -> other_object->lock (SINGLE_DEPTH_NESTING) + * + * No kmemleak_lock and object->lock nesting is allowed outside scan_mutex + * regions. 
+ * + * The kmemleak_object structures have a use_count incremented or decremented + * using the get_object()/put_object() functions. When the use_count becomes + * 0, this count can no longer be incremented and put_object() schedules the + * kmemleak_object freeing via an RCU callback. All calls to the get_object() + * function must be protected by rcu_read_lock() to avoid accessing a freed + * structure. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +/* + * Kmemleak configuration and common defines. + */ +#define MAX_TRACE 16 /* stack trace length */ +#define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ +#define SECS_FIRST_SCAN 60 /* delay before the first scan */ +#define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ +#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ + +#define BYTES_PER_POINTER sizeof(void *) + +/* GFP bitmask for kmemleak internal allocations */ +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \ + __GFP_NOLOCKDEP)) | \ + __GFP_NORETRY | __GFP_NOMEMALLOC | \ + __GFP_NOWARN) + +/* scanning area inside a memory block */ +struct kmemleak_scan_area { + struct hlist_node node; + unsigned long start; + size_t size; +}; + +#define KMEMLEAK_GREY 0 +#define KMEMLEAK_BLACK -1 + +/* + * Structure holding the metadata for each allocated memory block. + * Modifications to such objects should be made while holding the + * object->lock. 
Insertions or deletions from object_list, gray_list or
+ * rb_node are already protected by the corresponding locks or mutex (see
+ * the notes on locking above). These objects are reference-counted
+ * (use_count) and freed using the RCU mechanism.
+ */
+struct kmemleak_object {
+	raw_spinlock_t lock;		/* held while modifying this object */
+	unsigned int flags;		/* object status flags (OBJECT_* below) */
+	struct list_head object_list;	/* link in object_list, or in mem_pool_free_list once freed */
+	struct list_head gray_list;	/* link for the gray_list */
+	struct rb_node rb_node;		/* node in object_tree_root or object_phys_tree_root */
+	struct rcu_head rcu;		/* object_list lockless traversal */
+	/* object usage count; object freed when use_count == 0 */
+	atomic_t use_count;
+	unsigned long pointer;		/* start address of the tracked memory block */
+	size_t size;			/* size of the tracked memory block */
+	/* pass surplus references to this pointer */
+	unsigned long excess_ref;
+	/* minimum number of pointers that must be found, else it is considered a leak */
+	int min_count;
+	/* the total number of pointers found pointing to this object */
+	int count;
+	/* checksum for detecting modified objects */
+	u32 checksum;
+	/* memory ranges to be scanned inside an object (empty for all) */
+	struct hlist_head area_list;
+	unsigned long trace[MAX_TRACE];	/* stack trace saved when the object was created */
+	unsigned int trace_len;		/* number of entries stored in trace[] */
+	unsigned long jiffies;		/* creation timestamp */
+	pid_t pid;			/* pid of the current task (0 in hardirq/softirq context) */
+	char comm[TASK_COMM_LEN];	/* executable name ("hardirq"/"softirq" in irq context) */
+};
+
+/* flag representing the memory block allocation status */
+#define OBJECT_ALLOCATED	(1 << 0)
+/* flag set after the first reporting of an unreferenced object */
+#define OBJECT_REPORTED		(1 << 1)
+/* flag set to not scan the object */
+#define OBJECT_NO_SCAN		(1 << 2)
+/* flag set to fully scan the object when scan_area allocation failed */
+#define OBJECT_FULL_SCAN	(1 << 3)
+/* flag set for object allocated with physical address */
+#define OBJECT_PHYS		(1 << 4)
+
+#define HEX_PREFIX		" "	/* prefix string passed to the hex dump helpers */
+/* number of bytes to print per line; must be 16 or 32 */
+#define HEX_ROW_SIZE		16
+/* number of bytes to print at a time (1, 2, 4, 8) */
+#define HEX_GROUP_SIZE		1
+/* include ASCII after the hex output */
+#define HEX_ASCII		1
+/* max number of lines to be printed
*/
+#define HEX_MAX_LINES		2
+
+/* the list of all allocated objects */
+static LIST_HEAD(object_list);
+/* the list of gray-colored objects (see color_gray comment below) */
+static LIST_HEAD(gray_list);
+/* static pool used by mem_pool_alloc() when object_cache is not set or the slab allocation fails */
+static struct kmemleak_object mem_pool[CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE];
+static int mem_pool_free_count = ARRAY_SIZE(mem_pool);	/* mem_pool[] slots not handed out yet */
+static LIST_HEAD(mem_pool_free_list);	/* pool objects given back through mem_pool_free() */
+/* search tree for object boundaries */
+static struct rb_root object_tree_root = RB_ROOT;
+/* search tree for object (with OBJECT_PHYS flag) boundaries */
+static struct rb_root object_phys_tree_root = RB_ROOT;
+/* protects the access to object_list, object_tree_root (or object_phys_tree_root) and the memory pool */
+static DEFINE_RAW_SPINLOCK(kmemleak_lock);
+
+/* allocation caches for kmemleak internal data */
+static struct kmem_cache *object_cache;
+static struct kmem_cache *scan_area_cache;
+
+/* set if tracing memory operations is enabled */
+static int kmemleak_enabled = 1;
+/* same as above but only for the kmemleak_free() callback */
+static int kmemleak_free_enabled = 1;
+/* set in the late_initcall if there were no errors */
+static int kmemleak_initialized;
+/* set if a kmemleak warning was issued (see kmemleak_warn()) */
+static int kmemleak_warning;
+/* set if a fatal kmemleak error has occurred */
+static int kmemleak_error;
+
+/* minimum and maximum address that may be valid pointers (virtual-address objects only) */
+static unsigned long min_addr = ULONG_MAX;
+static unsigned long max_addr;
+
+static struct task_struct *scan_thread;
+/* used to avoid reporting of recently allocated objects */
+static unsigned long jiffies_min_age;
+static unsigned long jiffies_last_scan;	/* compared with creation time + jiffies_min_age in unreferenced_object() */
+/* delay between automatic memory scans */
+static unsigned long jiffies_scan_wait;
+/* enables or disables the task stacks scanning */
+static int kmemleak_stack_scan = 1;
+/* protects the memory scanning, parameters and debug/kmemleak file access */
+static DEFINE_MUTEX(scan_mutex);
+/* setting kmemleak=on, will set this var, skipping the
disable */
+static int kmemleak_skip_disable;
+/* If there are leaks that can be reported */
+static bool kmemleak_found_leaks;
+
+static bool kmemleak_verbose;
+module_param_named(verbose, kmemleak_verbose, bool, 0600);
+
+static void kmemleak_disable(void);
+
+/*
+ * Print a warning and dump the stack trace. kmemleak_warning is set to 1 so
+ * that the fact that a warning was issued is remembered.
+ */
+#define kmemleak_warn(x...)	do {		\
+	pr_warn(x);				\
+	dump_stack();				\
+	kmemleak_warning = 1;			\
+} while (0)
+
+/*
+ * Macro invoked when a serious kmemleak condition occurred and cannot be
+ * recovered from. Kmemleak will be disabled and further allocation/freeing
+ * tracing no longer available. The message is printed via kmemleak_warn()
+ * before kmemleak_disable() is called.
+ */
+#define kmemleak_stop(x...)	do {	\
+	kmemleak_warn(x);		\
+	kmemleak_disable();		\
+} while (0)
+
+#define warn_or_seq_printf(seq, fmt, ...)	do {	\
+	if (seq)					\
+		seq_printf(seq, fmt, ##__VA_ARGS__);	\
+	else						\
+		pr_warn(fmt, ##__VA_ARGS__);		\
+} while (0)	/* a NULL seq sends the message to the kernel log instead */
+
+static void warn_or_seq_hex_dump(struct seq_file *seq, int prefix_type,
+				 int rowsize, int groupsize, const void *buf,
+				 size_t len, bool ascii)
+{
+	if (seq)	/* a seq file was given: dump into it */
+		seq_hex_dump(seq, HEX_PREFIX, prefix_type, rowsize, groupsize,
+			     buf, len, ascii);
+	else		/* no seq file: dump to the kernel log at KERN_WARNING */
+		print_hex_dump(KERN_WARNING, pr_fmt(HEX_PREFIX), prefix_type,
+			       rowsize, groupsize, buf, len, ascii);
+}
+
+/*
+ * Printing of the objects hex dump to the seq file. The number of lines to be
+ * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
+ * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called
+ * with the object->lock held.
+ */ +static void hex_dump_object(struct seq_file *seq, + struct kmemleak_object *object) +{ + const u8 *ptr = (const u8 *)object->pointer; + size_t len; + + if (WARN_ON_ONCE(object->flags & OBJECT_PHYS)) + return; + + /* limit the number of lines to HEX_MAX_LINES */ + len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); + + warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len); + kasan_disable_current(); + warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE, + HEX_GROUP_SIZE, kasan_reset_tag((void *)ptr), len, HEX_ASCII); + kasan_enable_current(); +} + +/* + * Object colors, encoded with count and min_count: + * - white - orphan object, not enough references to it (count < min_count) + * - gray - not orphan, not marked as false positive (min_count == 0) or + * sufficient references to it (count >= min_count) + * - black - ignore, it doesn't contain references (e.g. text section) + * (min_count == -1). No function defined for this color. + * Newly created objects don't have any color assigned (object->count == -1) + * before the next memory scan when they become white. + */ +static bool color_white(const struct kmemleak_object *object) +{ + return object->count != KMEMLEAK_BLACK && + object->count < object->min_count; +} + +static bool color_gray(const struct kmemleak_object *object) +{ + return object->min_count != KMEMLEAK_BLACK && + object->count >= object->min_count; +} + +/* + * Objects are considered unreferenced only if their color is white, they have + * not be deleted and have a minimum age to avoid false positives caused by + * pointers temporarily stored in CPU registers. + */ +static bool unreferenced_object(struct kmemleak_object *object) +{ + return (color_white(object) && object->flags & OBJECT_ALLOCATED) && + time_before_eq(object->jiffies + jiffies_min_age, + jiffies_last_scan); +} + +/* + * Printing of the unreferenced objects information to the seq file. 
The
+ * print_unreferenced function must be called with the object->lock held.
+ */
+static void print_unreferenced(struct seq_file *seq,
+			       struct kmemleak_object *object)
+{
+	int i;
+	unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);	/* time since creation */
+
+	warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
+			   object->pointer, object->size);
+	warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
+			   object->comm, object->pid, object->jiffies,
+			   msecs_age / 1000, msecs_age % 1000);	/* printed as seconds.milliseconds */
+	hex_dump_object(seq, object);
+	warn_or_seq_printf(seq, " backtrace:\n");
+
+	for (i = 0; i < object->trace_len; i++) {	/* one line per saved stack entry */
+		void *ptr = (void *)object->trace[i];
+		warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
+	}
+}
+
+/*
+ * Print the kmemleak_object information to the kernel log. This function is
+ * used mainly for debugging special cases of kmemleak operations. It must be
+ * called with the object->lock held.
+ */
+static void dump_object_info(struct kmemleak_object *object)
+{
+	pr_notice("Object 0x%08lx (size %zu):\n",
+		  object->pointer, object->size);
+	pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
+		  object->comm, object->pid, object->jiffies);
+	pr_notice(" min_count = %d\n", object->min_count);
+	pr_notice(" count = %d\n", object->count);
+	pr_notice(" flags = 0x%x\n", object->flags);
+	pr_notice(" checksum = %u\n", object->checksum);
+	pr_notice(" backtrace:\n");
+	stack_trace_print(object->trace, object->trace_len, 4);
+}
+
+/*
+ * Look-up a memory block metadata (kmemleak_object) in the object search
+ * tree based on a pointer value. If alias is 0, only values pointing to the
+ * beginning of the memory block are allowed. The kmemleak_lock must be held
+ * when calling this function.
+ *
+ * is_phys selects object_phys_tree_root instead of object_tree_root. Both
+ * ptr and the stored object pointers are compared with the KASAN tag reset.
+ * Returns the matching object or NULL. When ptr falls inside a block but not
+ * at its start and alias is 0, a warning is printed, the object is dumped
+ * and NULL is returned.
+ */
+static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias,
+					       bool is_phys)
+{
+	struct rb_node *rb = is_phys ? object_phys_tree_root.rb_node :
+			     object_tree_root.rb_node;
+	unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+
+	while (rb) {
+		struct kmemleak_object *object;
+		unsigned long untagged_objp;
+
+		object = rb_entry(rb, struct kmemleak_object, rb_node);
+		untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
+
+		if (untagged_ptr < untagged_objp)	/* ptr lies below this block */
+			rb = object->rb_node.rb_left;
+		else if (untagged_objp + object->size <= untagged_ptr)	/* ptr lies at or past its end */
+			rb = object->rb_node.rb_right;
+		else if (untagged_objp == untagged_ptr || alias)	/* exact start, or interior pointer allowed */
+			return object;
+		else {
+			kmemleak_warn("Found object by alias at 0x%08lx\n",
+				      ptr);
+			dump_object_info(object);
+			break;
+		}
+	}
+	return NULL;
+}
+
+/* Look-up a kmemleak object which was allocated with a virtual address. */
+static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+{
+	return __lookup_object(ptr, alias, false);
+}
+
+/*
+ * Increment the object use_count. Return 1 if successful or 0 otherwise. Note
+ * that once an object's use_count reached 0, the RCU freeing was already
+ * registered and the object should no longer be used. This function must be
+ * called under the protection of rcu_read_lock().
+ */
+static int get_object(struct kmemleak_object *object)
+{
+	return atomic_inc_not_zero(&object->use_count);
+}
+
+/*
+ * Memory pool allocation and freeing. kmemleak_lock must not be held.
+ */
+static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	/* try the slab allocator first (only possible once object_cache is set) */
+	if (object_cache) {
+		object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+		if (object)
+			return object;
+	}
+
+	/* slab allocation failed or unavailable, try the memory pool */
+	raw_spin_lock_irqsave(&kmemleak_lock, flags);
+	object = list_first_entry_or_null(&mem_pool_free_list,
+					  typeof(*object), object_list);
+	if (object)	/* reuse a pool object that was freed earlier */
+		list_del(&object->object_list);
+	else if (mem_pool_free_count)	/* otherwise take a never-used slot of mem_pool[] */
+		object = &mem_pool[--mem_pool_free_count];
+	else		/* pool exhausted: object stays NULL */
+		pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
+	raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+
+	return object;
+}
+
+/*
+ * Return the object to either the slab allocator or the memory pool. An
+ * object whose address lies outside mem_pool[] is handed to
+ * kmem_cache_free(); a pool object is put on mem_pool_free_list, which is
+ * done under kmemleak_lock (taken here, so the caller must not hold it).
+ */
+static void mem_pool_free(struct kmemleak_object *object)
+{
+	unsigned long flags;
+
+	if (object < mem_pool || object >= mem_pool + ARRAY_SIZE(mem_pool)) {
+		kmem_cache_free(object_cache, object);
+		return;
+	}
+
+	/* add the object to the memory pool free list */
+	raw_spin_lock_irqsave(&kmemleak_lock, flags);
+	list_add(&object->object_list, &mem_pool_free_list);
+	raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+}
+
+/*
+ * RCU callback to free a kmemleak_object. All scan areas attached to the
+ * object are released first, then the object itself via mem_pool_free().
+ */
+static void free_object_rcu(struct rcu_head *rcu)
+{
+	struct hlist_node *tmp;
+	struct kmemleak_scan_area *area;
+	struct kmemleak_object *object =
+		container_of(rcu, struct kmemleak_object, rcu);
+
+	/*
+	 * Once use_count is 0 (guaranteed by put_object), there is no other
+	 * code accessing this object, hence no need for locking.
+	 */
+	hlist_for_each_entry_safe(area, tmp, &object->area_list, node) {
+		hlist_del(&area->node);
+		kmem_cache_free(scan_area_cache, area);
+	}
+	mem_pool_free(object);
+}
+
+/*
+ * Decrement the object use_count. Once the count is 0, free the object using
+ * an RCU callback.
Since put_object() may be called via the kmemleak_free() -> + * delete_object() path, the delayed RCU freeing ensures that there is no + * recursive call to the kernel allocator. Lock-less RCU object_list traversal + * is also possible. + */ +static void put_object(struct kmemleak_object *object) +{ + if (!atomic_dec_and_test(&object->use_count)) + return; + + /* should only get here after delete_object was called */ + WARN_ON(object->flags & OBJECT_ALLOCATED); + + /* + * It may be too early for the RCU callbacks, however, there is no + * concurrent object_list traversal when !object_cache and all objects + * came from the memory pool. Free the object directly. + */ + if (object_cache) + call_rcu(&object->rcu, free_object_rcu); + else + free_object_rcu(&object->rcu); +} + +/* + * Look up an object in the object search tree and increase its use_count. + */ +static struct kmemleak_object *__find_and_get_object(unsigned long ptr, int alias, + bool is_phys) +{ + unsigned long flags; + struct kmemleak_object *object; + + rcu_read_lock(); + raw_spin_lock_irqsave(&kmemleak_lock, flags); + object = __lookup_object(ptr, alias, is_phys); + raw_spin_unlock_irqrestore(&kmemleak_lock, flags); + + /* check whether the object is still available */ + if (object && !get_object(object)) + object = NULL; + rcu_read_unlock(); + + return object; +} + +/* Look up and get an object which allocated with virtual address. */ +static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) +{ + return __find_and_get_object(ptr, alias, false); +} + +/* + * Remove an object from the object_tree_root (or object_phys_tree_root) + * and object_list. Must be called with the kmemleak_lock held _if_ kmemleak + * is still enabled. + */ +static void __remove_object(struct kmemleak_object *object) +{ + rb_erase(&object->rb_node, object->flags & OBJECT_PHYS ? 
+ &object_phys_tree_root : + &object_tree_root); + list_del_rcu(&object->object_list); +} + +/* + * Look up an object in the object search tree and remove it from both + * object_tree_root (or object_phys_tree_root) and object_list. The + * returned object's use_count should be at least 1, as initially set + * by create_object(). + */ +static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias, + bool is_phys) +{ + unsigned long flags; + struct kmemleak_object *object; + + raw_spin_lock_irqsave(&kmemleak_lock, flags); + object = __lookup_object(ptr, alias, is_phys); + if (object) + __remove_object(object); + raw_spin_unlock_irqrestore(&kmemleak_lock, flags); + + return object; +} + +/* + * Save stack trace to the given array of MAX_TRACE size. + */ +static int __save_stack_trace(unsigned long *trace) +{ + return stack_trace_save(trace, MAX_TRACE, 2); +} + +/* + * Create the metadata (struct kmemleak_object) corresponding to an allocated + * memory block and add it to the object_list and object_tree_root (or + * object_phys_tree_root). + */ +static void __create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp, bool is_phys) +{ + unsigned long flags; + struct kmemleak_object *object, *parent; + struct rb_node **link, *rb_parent; + unsigned long untagged_ptr; + unsigned long untagged_objp; + + object = mem_pool_alloc(gfp); + if (!object) { + pr_warn("Cannot allocate a kmemleak_object structure\n"); + kmemleak_disable(); + return; + } + + INIT_LIST_HEAD(&object->object_list); + INIT_LIST_HEAD(&object->gray_list); + INIT_HLIST_HEAD(&object->area_list); + raw_spin_lock_init(&object->lock); + atomic_set(&object->use_count, 1); + object->flags = OBJECT_ALLOCATED | (is_phys ? 
OBJECT_PHYS : 0); + object->pointer = ptr; + object->size = kfence_ksize((void *)ptr) ?: size; + object->excess_ref = 0; + object->min_count = min_count; + object->count = 0; /* white color initially */ + object->jiffies = jiffies; + object->checksum = 0; + + /* task information */ + if (in_hardirq()) { + object->pid = 0; + strncpy(object->comm, "hardirq", sizeof(object->comm)); + } else if (in_serving_softirq()) { + object->pid = 0; + strncpy(object->comm, "softirq", sizeof(object->comm)); + } else { + object->pid = current->pid; + /* + * There is a small chance of a race with set_task_comm(), + * however using get_task_comm() here may cause locking + * dependency issues with current->alloc_lock. In the worst + * case, the command line is not correct. + */ + strncpy(object->comm, current->comm, sizeof(object->comm)); + } + + /* kernel backtrace */ + object->trace_len = __save_stack_trace(object->trace); + + raw_spin_lock_irqsave(&kmemleak_lock, flags); + + untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); + /* + * Only update min_addr and max_addr with object + * storing virtual address. + */ + if (!is_phys) { + min_addr = min(min_addr, untagged_ptr); + max_addr = max(max_addr, untagged_ptr + size); + } + link = is_phys ? &object_phys_tree_root.rb_node : + &object_tree_root.rb_node; + rb_parent = NULL; + while (*link) { + rb_parent = *link; + parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); + untagged_objp = (unsigned long)kasan_reset_tag((void *)parent->pointer); + if (untagged_ptr + size <= untagged_objp) + link = &parent->rb_node.rb_left; + else if (untagged_objp + parent->size <= untagged_ptr) + link = &parent->rb_node.rb_right; + else { + kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", + ptr); + /* + * No need for parent->lock here since "parent" cannot + * be freed while the kmemleak_lock is held. 
+ */ + dump_object_info(parent); + kmem_cache_free(object_cache, object); + goto out; + } + } + rb_link_node(&object->rb_node, rb_parent, link); + rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root : + &object_tree_root); + + list_add_tail_rcu(&object->object_list, &object_list); +out: + raw_spin_unlock_irqrestore(&kmemleak_lock, flags); +} + +/* Create kmemleak object which allocated with virtual address. */ +static void create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) +{ + __create_object(ptr, size, min_count, gfp, false); +} + +/* Create kmemleak object which allocated with physical address. */ +static void create_object_phys(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) +{ + __create_object(ptr, size, min_count, gfp, true); +} + +/* + * Mark the object as not allocated and schedule RCU freeing via put_object(). + */ +static void __delete_object(struct kmemleak_object *object) +{ + unsigned long flags; + + WARN_ON(!(object->flags & OBJECT_ALLOCATED)); + WARN_ON(atomic_read(&object->use_count) < 1); + + /* + * Locking here also ensures that the corresponding memory block + * cannot be freed when it is being scanned. + */ + raw_spin_lock_irqsave(&object->lock, flags); + object->flags &= ~OBJECT_ALLOCATED; + raw_spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + +/* + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. + */ +static void delete_object_full(unsigned long ptr) +{ + struct kmemleak_object *object; + + object = find_and_remove_object(ptr, 0, false); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Freeing unknown object at 0x%08lx\n", + ptr); +#endif + return; + } + __delete_object(object); +} + +/* + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. If the memory block is partially freed, the function may create + * additional metadata for the remaining parts of the block. 
+ */ +static void delete_object_part(unsigned long ptr, size_t size, bool is_phys) +{ + struct kmemleak_object *object; + unsigned long start, end; + + object = find_and_remove_object(ptr, 1, is_phys); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", + ptr, size); +#endif + return; + } + + /* + * Create one or two objects that may result from the memory block + * split. Note that partial freeing is only done by free_bootmem() and + * this happens before kmemleak_init() is called. + */ + start = object->pointer; + end = object->pointer + object->size; + if (ptr > start) + __create_object(start, ptr - start, object->min_count, + GFP_KERNEL, is_phys); + if (ptr + size < end) + __create_object(ptr + size, end - ptr - size, object->min_count, + GFP_KERNEL, is_phys); + + __delete_object(object); +} + +static void __paint_it(struct kmemleak_object *object, int color) +{ + object->min_count = color; + if (color == KMEMLEAK_BLACK) + object->flags |= OBJECT_NO_SCAN; +} + +static void paint_it(struct kmemleak_object *object, int color) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&object->lock, flags); + __paint_it(object, color); + raw_spin_unlock_irqrestore(&object->lock, flags); +} + +static void paint_ptr(unsigned long ptr, int color, bool is_phys) +{ + struct kmemleak_object *object; + + object = __find_and_get_object(ptr, 0, is_phys); + if (!object) { + kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n", + ptr, + (color == KMEMLEAK_GREY) ? "Grey" : + (color == KMEMLEAK_BLACK) ? "Black" : "Unknown"); + return; + } + paint_it(object, color); + put_object(object); +} + +/* + * Mark an object permanently as gray-colored so that it can no longer be + * reported as a leak. This is used in general to mark a false positive. 
+ */
+static void make_gray_object(unsigned long ptr)
+{
+	paint_ptr(ptr, KMEMLEAK_GREY, false);
+}
+
+/*
+ * Mark the object as black-colored so that it is ignored from scans and
+ * reporting. is_phys selects the tree of physical-address objects.
+ */
+static void make_black_object(unsigned long ptr, bool is_phys)
+{
+	paint_ptr(ptr, KMEMLEAK_BLACK, is_phys);
+}
+
+/*
+ * Add a scanning area to the object. If at least one such area is added,
+ * kmemleak will only scan these ranges rather than the whole memory block.
+ *
+ * ptr may point anywhere inside the object. A size of SIZE_MAX means "from
+ * ptr up to the end of the object". An area reaching beyond the end of the
+ * object is rejected with a warning. If the area structure cannot be
+ * allocated, the object is flagged OBJECT_FULL_SCAN instead.
+ */
+static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+	struct kmemleak_scan_area *area = NULL;
+	unsigned long untagged_ptr;
+	unsigned long untagged_objp;
+
+	object = find_and_get_object(ptr, 1);	/* alias allowed: ptr may be inside the object */
+	if (!object) {
+		kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
+			      ptr);
+		return;
+	}
+
+	untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+	untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
+
+	if (scan_area_cache)	/* allocate before taking object->lock */
+		area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
+
+	raw_spin_lock_irqsave(&object->lock, flags);
+	if (!area) {
+		pr_warn_once("Cannot allocate a scan area, scanning the full object\n");
+		/* mark the object for full scan to avoid false positives */
+		object->flags |= OBJECT_FULL_SCAN;
+		goto out_unlock;
+	}
+	if (size == SIZE_MAX) {	/* area runs from ptr to the end of the object */
+		size = untagged_objp + object->size - untagged_ptr;
+	} else if (untagged_ptr + size > untagged_objp + object->size) {
+		kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
+		dump_object_info(object);
+		kmem_cache_free(scan_area_cache, area);
+		goto out_unlock;
+	}
+
+	INIT_HLIST_NODE(&area->node);
+	area->start = ptr;
+	area->size = size;
+
+	hlist_add_head(&area->node, &object->area_list);
+out_unlock:
+	raw_spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);	/* drop the reference taken by find_and_get_object() */
+}
+
+/*
+ * Any surplus references (object already gray) to 'ptr' are passed to
+ * 'excess_ref'.
This is used in the vmalloc() case where a pointer to
+ * vm_struct may be used as an alternative reference to the vmalloc'ed object
+ * (see free_thread_stack()).
+ *
+ * The object is looked up by 'ptr'; a warning is issued and nothing is changed
+ * if no such object is tracked.
+ */
+static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("Setting excess_ref on unknown object at 0x%08lx\n",
+			      ptr);
+		return;
+	}
+
+	/* excess_ref is read by scan_block() with object->lock held */
+	raw_spin_lock_irqsave(&object->lock, flags);
+	object->excess_ref = excess_ref;
+	raw_spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/*
+ * Set the OBJECT_NO_SCAN flag for the object corresponding to the given
+ * pointer. Such object will not be scanned by kmemleak but references to it
+ * are searched.
+ */
+static void object_no_scan(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr);
+		return;
+	}
+
+	/* scan_object() tests this flag with object->lock held */
+	raw_spin_lock_irqsave(&object->lock, flags);
+	object->flags |= OBJECT_NO_SCAN;
+	raw_spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/**
+ * kmemleak_alloc - register a newly allocated object
+ * @ptr:	pointer to beginning of the object
+ * @size:	size of the object
+ * @min_count:	minimum number of references to this object. If during memory
+ *		scanning a number of references less than @min_count is found,
+ *		the object is reported as a memory leak. If @min_count is 0,
+ *		the object is never reported as a leak. If @min_count is -1,
+ *		the object is ignored (not scanned and not reported as a leak)
+ * @gfp:	kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the kernel allocators when a new object
+ * (memory block) is allocated (kmem_cache_alloc, kmalloc etc.).
+ */
+void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
+			  gfp_t gfp)
+{
+	pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
+
+	/* NULL and ERR_PTR() values are silently ignored */
+	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+		create_object((unsigned long)ptr, size, min_count, gfp);
+}
+EXPORT_SYMBOL_GPL(kmemleak_alloc);
+
+/**
+ * kmemleak_alloc_percpu - register a newly allocated __percpu object
+ * @ptr:	__percpu pointer to beginning of the object
+ * @size:	size of the object
+ * @gfp:	flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the kernel percpu allocator when a new object
+ * (memory block) is allocated (alloc_percpu).
+ *
+ * One kmemleak object of @size bytes is registered for every possible CPU.
+ */
+void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+				 gfp_t gfp)
+{
+	unsigned int cpu;
+
+	pr_debug("%s(0x%p, %zu)\n", __func__, ptr, size);
+
+	/*
+	 * Percpu allocations are only scanned and not reported as leaks
+	 * (min_count is set to 0).
+	 */
+	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+		for_each_possible_cpu(cpu)
+			create_object((unsigned long)per_cpu_ptr(ptr, cpu),
+				      size, 0, gfp);
+}
+EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
+
+/**
+ * kmemleak_vmalloc - register a newly vmalloc'ed object
+ * @area:	pointer to vm_struct
+ * @size:	size of the object
+ * @gfp:	__vmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the vmalloc() kernel allocator when a new
+ * object (memory block) is allocated.
+ */
+void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp)
+{
+	pr_debug("%s(0x%p, %zu)\n", __func__, area, size);
+
+	/*
+	 * A min_count = 2 is needed because vm_struct contains a reference to
+	 * the virtual address of the vmalloc'ed block.
+	 */
+	if (kmemleak_enabled) {
+		create_object((unsigned long)area->addr, size, 2, gfp);
+		/*
+		 * The excess_ref of the object tracked at 'area' (the
+		 * vm_struct) is pointed at the vmalloc'ed block.
+		 */
+		object_set_excess_ref((unsigned long)area,
+				      (unsigned long)area->addr);
+	}
+}
+EXPORT_SYMBOL_GPL(kmemleak_vmalloc);
+
+/**
+ * kmemleak_free - unregister a previously registered object
+ * @ptr:	pointer to beginning of the object
+ *
+ * This function is called from the kernel allocators when an object (memory
+ * block) is freed (kmem_cache_free, kfree, vfree etc.).
+ */
+void __ref kmemleak_free(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	/*
+	 * Freeing is gated by kmemleak_free_enabled rather than
+	 * kmemleak_enabled: kmemleak_disable() clears the latter first and
+	 * the former is only cleared once the scan thread has been stopped
+	 * (see kmemleak_do_cleanup()).
+	 */
+	if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
+		delete_object_full((unsigned long)ptr);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free);
+
+/**
+ * kmemleak_free_part - partially unregister a previously registered object
+ * @ptr:	pointer to the beginning or inside the object. This also
+ *		represents the start of the range to be freed
+ * @size:	size to be unregistered
+ *
+ * This function is called when only a part of a memory block is freed
+ * (usually from the bootmem allocator).
+ */
+void __ref kmemleak_free_part(const void *ptr, size_t size)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	/* 'ptr' is a virtual address here, hence is_phys == false */
+	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+		delete_object_part((unsigned long)ptr, size, false);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free_part);
+
+/**
+ * kmemleak_free_percpu - unregister a previously registered __percpu object
+ * @ptr:	__percpu pointer to beginning of the object
+ *
+ * This function is called from the kernel percpu allocator when an object
+ * (memory block) is freed (free_percpu).
+ */
+void __ref kmemleak_free_percpu(const void __percpu *ptr)
+{
+	unsigned int cpu;
+
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	/* mirrors kmemleak_alloc_percpu(): one object per possible CPU */
+	if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
+		for_each_possible_cpu(cpu)
+			delete_object_full((unsigned long)per_cpu_ptr(ptr,
+								      cpu));
+}
+EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
+
+/**
+ * kmemleak_update_trace - update object allocation stack trace
+ * @ptr:	pointer to beginning of the object
+ *
+ * Override the object allocation stack trace for cases where the actual
+ * allocation place is not always useful.
+ *
+ * The trace stored in the object is replaced by the one captured by
+ * __save_stack_trace() at the time of this call. Nothing is done for NULL or
+ * ERR_PTR() values, or when no object is found for @ptr.
+ */
+void __ref kmemleak_update_trace(const void *ptr)
+{
+	struct kmemleak_object *object;
+	unsigned long flags;
+
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr))
+		return;
+
+	object = find_and_get_object((unsigned long)ptr, 1);
+	if (!object) {
+		/* an unknown object is only reported in DEBUG builds */
+#ifdef DEBUG
+		kmemleak_warn("Updating stack trace for unknown object at %p\n",
+			      ptr);
+#endif
+		return;
+	}
+
+	raw_spin_lock_irqsave(&object->lock, flags);
+	object->trace_len = __save_stack_trace(object->trace);
+	raw_spin_unlock_irqrestore(&object->lock, flags);
+
+	put_object(object);
+}
+EXPORT_SYMBOL(kmemleak_update_trace);
+
+/**
+ * kmemleak_not_leak - mark an allocated object as false positive
+ * @ptr:	pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to no longer
+ * be reported as leak and always be scanned.
+ */
+void __ref kmemleak_not_leak(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+		make_gray_object((unsigned long)ptr);
+}
+EXPORT_SYMBOL(kmemleak_not_leak);
+
+/**
+ * kmemleak_ignore - ignore an allocated object
+ * @ptr:	pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to be
+ * ignored (not scanned and not reported as a leak).
This is usually done when
+ * it is known that the corresponding block is not a leak and does not contain
+ * any references to other allocated memory blocks.
+ */
+void __ref kmemleak_ignore(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	/* 'ptr' is a virtual address here, hence is_phys == false */
+	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+		make_black_object((unsigned long)ptr, false);
+}
+EXPORT_SYMBOL(kmemleak_ignore);
+
+/**
+ * kmemleak_scan_area - limit the range to be scanned in an allocated object
+ * @ptr:	pointer to beginning or inside the object. This also
+ *		represents the start of the scan area
+ * @size:	size of the scan area
+ * @gfp:	kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is used when it is known that only certain parts of an object
+ * contain references to other objects. Kmemleak will only scan these areas
+ * reducing the number of false negatives.
+ *
+ * A @size of 0 is ignored; add_scan_area() treats a @size of SIZE_MAX as
+ * "up to the end of the object".
+ */
+void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (kmemleak_enabled && ptr && size && !IS_ERR(ptr))
+		add_scan_area((unsigned long)ptr, size, gfp);
+}
+EXPORT_SYMBOL(kmemleak_scan_area);
+
+/**
+ * kmemleak_no_scan - do not scan an allocated object
+ * @ptr:	pointer to beginning of the object
+ *
+ * This function notifies kmemleak not to scan the given memory block. Useful
+ * in situations where it is known that the given object does not contain any
+ * references to other objects. Kmemleak will not scan such objects reducing
+ * the number of false negatives.
+ */
+void __ref kmemleak_no_scan(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+		object_no_scan((unsigned long)ptr);
+}
+EXPORT_SYMBOL(kmemleak_no_scan);
+
+/**
+ * kmemleak_alloc_phys - similar to kmemleak_alloc but taking a physical
+ *			 address argument
+ * @phys:	physical address of the object
+ * @size:	size of the object
+ * @gfp:	kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * Unlike kmemleak_alloc(), no NULL/ERR_PTR() filtering is applied to @phys.
+ */
+void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp)
+{
+	pr_debug("%s(0x%pa, %zu)\n", __func__, &phys, size);
+
+	if (kmemleak_enabled)
+		/*
+		 * Create object with OBJECT_PHYS flag and
+		 * assume min_count 0.
+		 */
+		create_object_phys((unsigned long)phys, size, 0, gfp);
+}
+EXPORT_SYMBOL(kmemleak_alloc_phys);
+
+/**
+ * kmemleak_free_part_phys - similar to kmemleak_free_part but taking a
+ *			     physical address argument
+ * @phys:	physical address of the beginning or inside an object. This
+ *		also represents the start of the range to be freed
+ * @size:	size to be unregistered
+ */
+void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
+{
+	pr_debug("%s(0x%pa)\n", __func__, &phys);
+
+	/* is_phys == true: the object is looked up by physical address */
+	if (kmemleak_enabled)
+		delete_object_part((unsigned long)phys, size, true);
+}
+EXPORT_SYMBOL(kmemleak_free_part_phys);
+
+/**
+ * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical
+ *			  address argument
+ * @phys:	physical address of the object
+ */
+void __ref kmemleak_ignore_phys(phys_addr_t phys)
+{
+	pr_debug("%s(0x%pa)\n", __func__, &phys);
+
+	/* is_phys == true: the object is looked up by physical address */
+	if (kmemleak_enabled)
+		make_black_object((unsigned long)phys, true);
+}
+EXPORT_SYMBOL(kmemleak_ignore_phys);
+
+/*
+ * Update an object's checksum and return true if it was modified.
+ */
+static bool update_checksum(struct kmemleak_object *object)
+{
+	u32 old_csum = object->checksum;
+
+	/* objects tracked by physical address are not checksummed */
+	if (WARN_ON_ONCE(object->flags & OBJECT_PHYS))
+		return false;
+
+	/* KASAN and KCSAN checking is disabled around the raw read of the block */
+	kasan_disable_current();
+	kcsan_disable_current();
+	object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size);
+	kasan_enable_current();
+	kcsan_enable_current();
+
+	return object->checksum != old_csum;
+}
+
+/*
+ * Update an object's references. object->lock must be held by the caller.
+ */
+static void update_refs(struct kmemleak_object *object)
+{
+	if (!color_white(object)) {
+		/* non-orphan, ignored or new */
+		return;
+	}
+
+	/*
+	 * Increase the object's reference count (number of pointers to the
+	 * memory block). If this count reaches the required minimum, the
+	 * object's color will become gray and it will be added to the
+	 * gray_list.
+	 */
+	object->count++;
+	if (color_gray(object)) {
+		/* put_object() called when removing from gray_list */
+		WARN_ON(!get_object(object));
+		list_add_tail(&object->gray_list, &gray_list);
+	}
+}
+
+/*
+ * Memory scanning is a long process and it needs to be interruptible. This
+ * function checks whether such interrupt condition occurred.
+ *
+ * Returns non-zero if the scan should stop: kmemleak has been disabled, a
+ * signal is pending for a task with an mm, or kthread_should_stop() is true
+ * for a task without one.
+ */
+static int scan_should_stop(void)
+{
+	if (!kmemleak_enabled)
+		return 1;
+
+	/*
+	 * This function may be called from either process or kthread context,
+	 * hence the need to check for both stop conditions.
+	 */
+	if (current->mm)
+		return signal_pending(current);
+
+	return kthread_should_stop();
+}
+
+/*
+ * Scan a memory block (exclusive range) for valid pointers and add those
+ * found to the gray list.
+ */
+static void scan_block(void *_start, void *_end,
+		       struct kmemleak_object *scanned)
+{
+	unsigned long *ptr;
+	/*
+	 * Only pointer-aligned words that lie entirely inside [_start, _end)
+	 * are read: the start is rounded up and the end is pulled back by
+	 * BYTES_PER_POINTER - 1.
+	 */
+	unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
+	unsigned long *end = _end - (BYTES_PER_POINTER - 1);
+	unsigned long flags;
+	unsigned long untagged_ptr;
+
+	raw_spin_lock_irqsave(&kmemleak_lock, flags);
+	for (ptr = start; ptr < end; ptr++) {
+		struct kmemleak_object *object;
+		unsigned long pointer;
+		unsigned long excess_ref;
+
+		if (scan_should_stop())
+			break;
+
+		/* read the candidate value with KASAN checking disabled */
+		kasan_disable_current();
+		pointer = *(unsigned long *)kasan_reset_tag((void *)ptr);
+		kasan_enable_current();
+
+		/* range filter on the untagged value before lookup_object() */
+		untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer);
+		if (untagged_ptr < min_addr || untagged_ptr >= max_addr)
+			continue;
+
+		/*
+		 * No need for get_object() here since we hold kmemleak_lock.
+		 * object->use_count cannot be dropped to 0 while the object
+		 * is still present in object_tree_root and object_list
+		 * (with updates protected by kmemleak_lock).
+		 */
+		object = lookup_object(pointer, 1);
+		if (!object)
+			continue;
+		if (object == scanned)
+			/* self referenced, ignore */
+			continue;
+
+		/*
+		 * Avoid the lockdep recursive warning on object->lock being
+		 * previously acquired in scan_object(). These locks are
+		 * enclosed by scan_mutex.
+		 */
+		raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+		/* only pass surplus references (object already gray) */
+		if (color_gray(object)) {
+			excess_ref = object->excess_ref;
+			/* no need for update_refs() if object already gray */
+		} else {
+			excess_ref = 0;
+			update_refs(object);
+		}
+		raw_spin_unlock(&object->lock);
+
+		/* credit the surplus reference to the excess_ref object instead */
+		if (excess_ref) {
+			object = lookup_object(excess_ref, 0);
+			if (!object)
+				continue;
+			if (object == scanned)
+				/* circular reference, ignore */
+				continue;
+			raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+			update_refs(object);
+			raw_spin_unlock(&object->lock);
+		}
+	}
+	raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
+}
+
+/*
+ * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency.
+ * Only built with CONFIG_SMP; the only caller visible here is the per-cpu
+ * section scanning in kmemleak_scan().
+ */
+#ifdef CONFIG_SMP
+static void scan_large_block(void *start, void *end)
+{
+	void *next;
+
+	while (start < end) {
+		next = min(start + MAX_SCAN_SIZE, end);
+		/* no kmemleak object owns this block, hence scanned == NULL */
+		scan_block(start, next, NULL);
+		start = next;
+		cond_resched();
+	}
+}
+#endif
+
+/*
+ * Scan a memory block corresponding to a kmemleak_object. A condition is
+ * that object->use_count >= 1.
+ */
+static void scan_object(struct kmemleak_object *object)
+{
+	struct kmemleak_scan_area *area;
+	unsigned long flags;
+	void *obj_ptr;
+
+	/*
+	 * Once the object->lock is acquired, the corresponding memory block
+	 * cannot be freed (the same lock is acquired in delete_object).
+	 */
+	raw_spin_lock_irqsave(&object->lock, flags);
+	if (object->flags & OBJECT_NO_SCAN)
+		goto out;
+	if (!(object->flags & OBJECT_ALLOCATED))
+		/* already freed object */
+		goto out;
+
+	/* objects tracked by physical address are scanned through __va() */
+	obj_ptr = object->flags & OBJECT_PHYS ?
+		  __va((phys_addr_t)object->pointer) :
+		  (void *)object->pointer;
+
+	if (hlist_empty(&object->area_list) ||
+	    object->flags & OBJECT_FULL_SCAN) {
+		void *start = obj_ptr;
+		void *end = obj_ptr + object->size;
+		void *next;
+
+		/*
+		 * Scan the whole block in MAX_SCAN_SIZE chunks. The lock is
+		 * dropped around cond_resched() between chunks, so
+		 * OBJECT_ALLOCATED is re-checked before each further chunk.
+		 */
+		do {
+			next = min(start + MAX_SCAN_SIZE, end);
+			scan_block(start, next, object);
+
+			start = next;
+			if (start >= end)
+				break;
+
+			raw_spin_unlock_irqrestore(&object->lock, flags);
+			cond_resched();
+			raw_spin_lock_irqsave(&object->lock, flags);
+		} while (object->flags & OBJECT_ALLOCATED);
+	} else
+		/* only the registered scan areas are scanned */
+		hlist_for_each_entry(area, &object->area_list, node)
+			scan_block((void *)area->start,
+				   (void *)(area->start + area->size),
+				   object);
+out:
+	raw_spin_unlock_irqrestore(&object->lock, flags);
+}
+
+/*
+ * Scan the objects already referenced (gray objects). More objects will be
+ * referenced and, if there are no memory leaks, all the objects are scanned.
+ */
+static void scan_gray_list(void)
+{
+	struct kmemleak_object *object, *tmp;
+
+	/*
+	 * The list traversal is safe for both tail additions and removals
+	 * from inside the loop. The kmemleak objects cannot be freed from
+	 * outside the loop because their use_count was incremented.
+	 */
+	object = list_entry(gray_list.next, typeof(*object), gray_list);
+	while (&object->gray_list != &gray_list) {
+		cond_resched();
+
+		/* may add new objects to the list */
+		if (!scan_should_stop())
+			scan_object(object);
+
+		/* fetch the successor only now: the scan may have appended entries */
+		tmp = list_entry(object->gray_list.next, typeof(*object),
+				 gray_list);
+
+		/* remove the object from the list and release it */
+		list_del(&object->gray_list);
+		put_object(object);
+
+		object = tmp;
+	}
+	WARN_ON(!list_empty(&gray_list));
+}
+
+/*
+ * Conditionally call resched() in an object iteration loop while making sure
+ * that the given object won't go away without RCU read lock by performing a
+ * get_object() if !pinned.
+ *
+ * Return: false if can't do a cond_resched() due to get_object() failure
+ *	   true otherwise
+ */
+static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned)
+{
+	if (!pinned && !get_object(object))
+		return false;
+
+	/*
+	 * NOTE(review): the object is pinned while the RCU read lock is
+	 * dropped, but nothing visible here guarantees that it is still
+	 * linked on object_list afterwards - confirm that the callers may
+	 * safely continue their list iteration from it.
+	 */
+	rcu_read_unlock();
+	cond_resched();
+	rcu_read_lock();
+	if (!pinned)
+		put_object(object);
+	return true;
+}
+
+/*
+ * Scan data sections and all the referenced memory blocks allocated via the
+ * kernel's standard allocators. This function must be called with the
+ * scan_mutex held.
+ *
+ * The scan runs in phases: whiten all objects and seed the gray list, scan
+ * the per-cpu sections, the struct pages and (optionally) the task stacks,
+ * scan the gray list, re-scan white objects whose checksum changed and
+ * finally report the objects that are still unreferenced.
+ */
+static void kmemleak_scan(void)
+{
+	struct kmemleak_object *object;
+	struct zone *zone;
+	int __maybe_unused i;
+	int new_leaks = 0;
+	int loop_cnt = 0;
+
+	jiffies_last_scan = jiffies;
+
+	/* prepare the kmemleak_object's */
+	rcu_read_lock();
+	list_for_each_entry_rcu(object, &object_list, object_list) {
+		bool obj_pinned = false;
+
+		raw_spin_lock_irq(&object->lock);
+#ifdef DEBUG
+		/*
+		 * With a few exceptions there should be a maximum of
+		 * 1 reference to any object at this point.
+		 */
+		if (atomic_read(&object->use_count) > 1) {
+			pr_debug("object->use_count = %d\n",
+				 atomic_read(&object->use_count));
+			dump_object_info(object);
+		}
+#endif
+
+		/*
+		 * Ignore objects outside lowmem (paint them black).
+		 * phys + object->size is the exclusive end of the object, so
+		 * an object ending exactly at the lowmem boundary has an end
+		 * PFN equal to max_low_pfn and is still entirely in lowmem:
+		 * only a strictly larger end PFN puts it outside.
+		 */
+		if ((object->flags & OBJECT_PHYS) &&
+		   !(object->flags & OBJECT_NO_SCAN)) {
+			unsigned long phys = object->pointer;
+
+			if (PHYS_PFN(phys) < min_low_pfn ||
+			    PHYS_PFN(phys + object->size) > max_low_pfn)
+				__paint_it(object, KMEMLEAK_BLACK);
+		}
+
+		/* reset the reference count (whiten the object) */
+		object->count = 0;
+		if (color_gray(object) && get_object(object)) {
+			list_add_tail(&object->gray_list, &gray_list);
+			obj_pinned = true;
+		}
+
+		raw_spin_unlock_irq(&object->lock);
+
+		/*
+		 * Do a cond_resched() every 64k objects to avoid soft lockup.
+		 */
+		if (!(++loop_cnt & 0xffff) &&
+		    !kmemleak_cond_resched(object, obj_pinned))
+			loop_cnt--; /* Try again on next object */
+	}
+	rcu_read_unlock();
+
+#ifdef CONFIG_SMP
+	/* per-cpu sections scanning */
+	for_each_possible_cpu(i)
+		scan_large_block(__per_cpu_start + per_cpu_offset(i),
+				 __per_cpu_end + per_cpu_offset(i));
+#endif
+
+	/*
+	 * Struct page scanning for each node.
+	 */
+	get_online_mems();
+	for_each_populated_zone(zone) {
+		unsigned long start_pfn = zone->zone_start_pfn;
+		unsigned long end_pfn = zone_end_pfn(zone);
+		unsigned long pfn;
+
+		for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+			struct page *page = pfn_to_online_page(pfn);
+
+			if (!page)
+				continue;
+
+			/* only scan pages belonging to this zone */
+			if (page_zone(page) != zone)
+				continue;
+			/* only scan if page is in use */
+			if (page_count(page) == 0)
+				continue;
+			/* the struct page itself is scanned, not the page contents */
+			scan_block(page, page + 1, NULL);
+			if (!(pfn & 63))
+				cond_resched();
+		}
+	}
+	put_online_mems();
+
+	/*
+	 * Scanning the task stacks (may introduce false negatives).
+	 */
+	if (kmemleak_stack_scan) {
+		struct task_struct *p, *g;
+
+		rcu_read_lock();
+		for_each_process_thread(g, p) {
+			void *stack = try_get_task_stack(p);
+			if (stack) {
+				scan_block(stack, stack + THREAD_SIZE, NULL);
+				put_task_stack(p);
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	/*
+	 * Scan the objects already referenced from the sections scanned
+	 * above.
+	 */
+	scan_gray_list();
+
+	/*
+	 * Check for new or unreferenced objects modified since the previous
+	 * scan and color them gray until the next scan.
+	 */
+	rcu_read_lock();
+	loop_cnt = 0;
+	list_for_each_entry_rcu(object, &object_list, object_list) {
+		/*
+		 * Do a cond_resched() every 64k objects to avoid soft lockup.
+		 */
+		if (!(++loop_cnt & 0xffff) &&
+		    !kmemleak_cond_resched(object, false))
+			loop_cnt--;	/* Try again on next object */
+
+		/*
+		 * This is racy but we can save the overhead of lock/unlock
+		 * calls.  The missed objects, if any, should be caught in
+		 * the next scan.
+		 */
+		if (!color_white(object))
+			continue;
+		raw_spin_lock_irq(&object->lock);
+		if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
+		    && update_checksum(object) && get_object(object)) {
+			/* color it gray temporarily */
+			object->count = object->min_count;
+			list_add_tail(&object->gray_list, &gray_list);
+		}
+		raw_spin_unlock_irq(&object->lock);
+	}
+	rcu_read_unlock();
+
+	/*
+	 * Re-scan the gray list for modified unreferenced objects.
+	 */
+	scan_gray_list();
+
+	/*
+	 * If scanning was stopped do not report any new unreferenced objects.
+	 */
+	if (scan_should_stop())
+		return;
+
+	/*
+	 * Scanning result reporting.
+	 */
+	rcu_read_lock();
+	loop_cnt = 0;
+	list_for_each_entry_rcu(object, &object_list, object_list) {
+		/*
+		 * Do a cond_resched() every 64k objects to avoid soft lockup.
+		 */
+		if (!(++loop_cnt & 0xffff) &&
+		    !kmemleak_cond_resched(object, false))
+			loop_cnt--;	/* Try again on next object */
+
+		/*
+		 * This is racy but we can save the overhead of lock/unlock
+		 * calls.  The missed objects, if any, should be caught in
+		 * the next scan.
+		 */
+		if (!color_white(object))
+			continue;
+		raw_spin_lock_irq(&object->lock);
+		if (unreferenced_object(object) &&
+		    !(object->flags & OBJECT_REPORTED)) {
+			/* each object is counted as a new leak only once */
+			object->flags |= OBJECT_REPORTED;
+
+			if (kmemleak_verbose)
+				print_unreferenced(NULL, object);
+
+			new_leaks++;
+		}
+		raw_spin_unlock_irq(&object->lock);
+	}
+	rcu_read_unlock();
+
+	if (new_leaks) {
+		kmemleak_found_leaks = true;
+
+		pr_info("%d new suspected memory leaks (see /sys/kernel/debug/kmemleak)\n",
+			new_leaks);
+	}
+
+}
+
+/*
+ * Thread function performing automatic memory scanning. Unreferenced objects
+ * at the end of a memory scan are reported but only the first time.
+ */
+static int kmemleak_scan_thread(void *arg)
+{
+	/* static: the initial delay is applied only once, not on every restart */
+	static int first_run = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN);
+
+	pr_info("Automatic memory scanning thread started\n");
+	set_user_nice(current, 10);
+
+	/*
+	 * Wait before the first scan to allow the system to fully initialize.
+	 */
+	if (first_run) {
+		signed long timeout = msecs_to_jiffies(SECS_FIRST_SCAN * 1000);
+		first_run = 0;
+		while (timeout && !kthread_should_stop())
+			timeout = schedule_timeout_interruptible(timeout);
+	}
+
+	while (!kthread_should_stop()) {
+		/* sampled before the scan; kmemleak_write() updates it with WRITE_ONCE() */
+		signed long timeout = READ_ONCE(jiffies_scan_wait);
+
+		mutex_lock(&scan_mutex);
+		kmemleak_scan();
+		mutex_unlock(&scan_mutex);
+
+		/* wait before the next scan */
+		while (timeout && !kthread_should_stop())
+			timeout = schedule_timeout_interruptible(timeout);
+	}
+
+	pr_info("Automatic memory scanning thread ended\n");
+
+	return 0;
+}
+
+/*
+ * Start the automatic memory scanning thread. This function must be called
+ * with the scan_mutex held.
+ *
+ * Nothing is done if the thread already exists. If the thread cannot be
+ * created, a warning is printed and scan_thread is reset to NULL.
+ */
+static void start_scan_thread(void)
+{
+	if (scan_thread)
+		return;
+	scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
+	if (IS_ERR(scan_thread)) {
+		pr_warn("Failed to create the scan thread\n");
+		scan_thread = NULL;
+	}
+}
+
+/*
+ * Stop the automatic memory scanning thread.
+ */
+static void stop_scan_thread(void)
+{
+	if (scan_thread) {
+		kthread_stop(scan_thread);
+		scan_thread = NULL;
+	}
+}
+
+/*
+ * Iterate over the object_list and return the first valid object at or after
+ * the required position with its use_count incremented. The function triggers
+ * a memory scanning when the pos argument points to the first position.
+ */
+static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct kmemleak_object *object;
+	loff_t n = *pos;
+	int err;
+
+	/*
+	 * NOTE(review): no memory scan is started in the code visible here,
+	 * whatever the header comment says - confirm and update the comment.
+	 */
+	err = mutex_lock_interruptible(&scan_mutex);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	/*
+	 * From here on both scan_mutex and the RCU read lock stay held; they
+	 * are released in kmemleak_seq_stop().
+	 */
+	rcu_read_lock();
+	list_for_each_entry_rcu(object, &object_list, object_list) {
+		/* skip the first *pos entries */
+		if (n-- > 0)
+			continue;
+		if (get_object(object))
+			goto out;
+	}
+	object = NULL;
+out:
+	return object;
+}
+
+/*
+ * Return the next object in the object_list. The function decrements the
+ * use_count of the previous object and increases that of the next one.
+ */
+static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct kmemleak_object *prev_obj = v;
+	struct kmemleak_object *next_obj = NULL;
+	struct kmemleak_object *obj = prev_obj;
+
+	++(*pos);
+
+	/* objects for which get_object() fails are skipped */
+	list_for_each_entry_continue_rcu(obj, &object_list, object_list) {
+		if (get_object(obj)) {
+			next_obj = obj;
+			break;
+		}
+	}
+
+	/* the previous object is only released after the list walk */
+	put_object(prev_obj);
+	return next_obj;
+}
+
+/*
+ * Decrement the use_count of the last object required, if any.
+ */
+static void kmemleak_seq_stop(struct seq_file *seq, void *v)
+{
+	if (!IS_ERR(v)) {
+		/*
+		 * kmemleak_seq_start may return ERR_PTR if the scan_mutex
+		 * waiting was interrupted, so only release it if !IS_ERR.
+		 */
+		rcu_read_unlock();
+		mutex_unlock(&scan_mutex);
+		if (v)
+			put_object(v);
+	}
+}
+
+/*
+ * Print the information for an unreferenced object to the seq file.
+ */
+static int kmemleak_seq_show(struct seq_file *seq, void *v)
+{
+	struct kmemleak_object *object = v;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&object->lock, flags);
+	/* only objects already reported by a scan and still unreferenced are shown */
+	if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
+		print_unreferenced(seq, object);
+	raw_spin_unlock_irqrestore(&object->lock, flags);
+	return 0;
+}
+
+/* seq_file iterator over object_list, used for reading the kmemleak file */
+static const struct seq_operations kmemleak_seq_ops = {
+	.start = kmemleak_seq_start,
+	.next  = kmemleak_seq_next,
+	.stop  = kmemleak_seq_stop,
+	.show  = kmemleak_seq_show,
+};
+
+/* open() handler of the kmemleak file: set up the seq_file iterator */
+static int kmemleak_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &kmemleak_seq_ops);
+}
+
+/*
+ * Handler for the "dump=..." command: parse the address in 'str' and dump the
+ * information of the object found there. Returns 0 on success or -EINVAL if
+ * the address cannot be parsed or no object is found.
+ */
+static int dump_str_object_info(const char *str)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+	unsigned long addr;
+
+	/* base 0: kstrtoul() detects the base from the prefix */
+	if (kstrtoul(str, 0, &addr))
+		return -EINVAL;
+	object = find_and_get_object(addr, 0);
+	if (!object) {
+		pr_info("Unknown object at 0x%08lx\n", addr);
+		return -EINVAL;
+	}
+
+	raw_spin_lock_irqsave(&object->lock, flags);
+	dump_object_info(object);
+	raw_spin_unlock_irqrestore(&object->lock, flags);
+
+	put_object(object);
+	return 0;
+}
+
+/*
+ * We use grey instead of black to ensure we can do future scans on the same
+ * objects. If we did not do future scans these black objects could
+ * potentially contain references to newly allocated objects in the future and
+ * we'd end up with false positives.
+ */
+static void kmemleak_clear(void)
+{
+	struct kmemleak_object *object;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(object, &object_list, object_list) {
+		raw_spin_lock_irq(&object->lock);
+		/* only objects currently reported as leaks are repainted */
+		if ((object->flags & OBJECT_REPORTED) &&
+		    unreferenced_object(object))
+			__paint_it(object, KMEMLEAK_GREY);
+		raw_spin_unlock_irq(&object->lock);
+	}
+	rcu_read_unlock();
+
+	kmemleak_found_leaks = false;
+}
+
+static void __kmemleak_do_cleanup(void);
+
+/*
+ * File write operation to configure kmemleak at run-time.
The following
+ * commands can be written to the /sys/kernel/debug/kmemleak file:
+ *   off	- disable kmemleak (irreversible)
+ *   stack=on	- enable the task stacks scanning
+ *   stack=off	- disable the tasks stacks scanning
+ *   scan=on	- start the automatic memory scanning thread
+ *   scan=off	- stop the automatic memory scanning thread
+ *   scan=...	- set the automatic memory scanning period in seconds (0 to
+ *		  disable it)
+ *   scan	- trigger a memory scan
+ *   clear	- mark all current reported unreferenced kmemleak objects as
+ *		  grey to ignore printing them, or free all kmemleak objects
+ *		  if kmemleak has been disabled.
+ *   dump=...	- dump information about the object found at the given address
+ *
+ * Commands are matched by prefix and in the order of the checks below. Once
+ * kmemleak has been disabled, every command except "clear" fails with -EPERM.
+ * Returns the number of bytes consumed (always 'size') or a negative errno.
+ */
+static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
+			      size_t size, loff_t *ppos)
+{
+	char buf[64];
+	int buf_size;
+	int ret;
+
+	/* keep room for the terminating NUL; longer input is truncated */
+	buf_size = min(size, (sizeof(buf) - 1));
+	if (strncpy_from_user(buf, user_buf, buf_size) < 0)
+		return -EFAULT;
+	buf[buf_size] = 0;
+
+	ret = mutex_lock_interruptible(&scan_mutex);
+	if (ret < 0)
+		return ret;
+
+	if (strncmp(buf, "clear", 5) == 0) {
+		if (kmemleak_enabled)
+			kmemleak_clear();
+		else
+			__kmemleak_do_cleanup();
+		goto out;
+	}
+
+	if (!kmemleak_enabled) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (strncmp(buf, "off", 3) == 0)
+		kmemleak_disable();
+	else if (strncmp(buf, "stack=on", 8) == 0)
+		kmemleak_stack_scan = 1;
+	else if (strncmp(buf, "stack=off", 9) == 0)
+		kmemleak_stack_scan = 0;
+	else if (strncmp(buf, "scan=on", 7) == 0)
+		start_scan_thread();
+	else if (strncmp(buf, "scan=off", 8) == 0)
+		stop_scan_thread();
+	else if (strncmp(buf, "scan=", 5) == 0) {
+		unsigned secs;
+		unsigned long msecs;
+
+		ret = kstrtouint(buf + 5, 0, &secs);
+		if (ret < 0)
+			goto out;
+
+		/*
+		 * Clamp before multiplying: where unsigned long is 32 bits
+		 * wide the product would wrap around instead of exceeding
+		 * UINT_MAX, turning a very long period into a short one.
+		 */
+		if (secs > UINT_MAX / MSEC_PER_SEC)
+			msecs = UINT_MAX;
+		else
+			msecs = secs * MSEC_PER_SEC;
+
+		/* a period of 0 leaves the scan thread stopped */
+		stop_scan_thread();
+		if (msecs) {
+			WRITE_ONCE(jiffies_scan_wait, msecs_to_jiffies(msecs));
+			start_scan_thread();
+		}
+	} else if (strncmp(buf, "scan", 4) == 0)
+		kmemleak_scan();
+	else if (strncmp(buf, "dump=", 5) == 0)
+		ret = dump_str_object_info(buf + 5);
+	else
+		ret = -EINVAL;
+
+out:
+	mutex_unlock(&scan_mutex);
+	if (ret < 0)
+		return ret;
+
+	/* ignore the rest of the buffer, only one command at a time */
+	*ppos += size;
+	return size;
+}
+
+/* file operations of the kmemleak file: seq_file for reads, commands on write */
+static const struct file_operations kmemleak_fops = {
+	.owner		= THIS_MODULE,
+	.open		= kmemleak_open,
+	.read		= seq_read,
+	.write		= kmemleak_write,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+/*
+ * Remove and delete every object on object_list. The callers visible here
+ * (kmemleak_write() with scan_mutex held and the cleanup work item) run in
+ * process context, so the loop may reschedule.
+ */
+static void __kmemleak_do_cleanup(void)
+{
+	struct kmemleak_object *object, *tmp;
+	unsigned int cnt = 0;
+
+	/*
+	 * Kmemleak has already been disabled, no need for RCU list traversal
+	 * or kmemleak_lock held.
+	 */
+	list_for_each_entry_safe(object, tmp, &object_list, object_list) {
+		__remove_object(object);
+		__delete_object(object);
+
+		/* call cond_resched() once per 64 iterations to avoid soft lockup */
+		if (!(++cnt & 0x3f))
+			cond_resched();
+	}
+}
+
+/*
+ * Stop the memory scanning thread and free the kmemleak internal objects if
+ * no previous scan thread (otherwise, kmemleak may still have some useful
+ * information on memory leaks).
+ */
+static void kmemleak_do_cleanup(struct work_struct *work)
+{
+	stop_scan_thread();
+
+	mutex_lock(&scan_mutex);
+	/*
+	 * Once it is made sure that kmemleak_scan has stopped, it is safe to no
+	 * longer track object freeing. Ordering of the scan thread stopping and
+	 * the memory accesses below is guaranteed by the kthread_stop()
+	 * function.
+	 */
+	kmemleak_free_enabled = 0;
+	mutex_unlock(&scan_mutex);
+
+	/* keep the objects if leaks were found so that they can still be read */
+	if (!kmemleak_found_leaks)
+		__kmemleak_do_cleanup();
+	else
+		pr_info("Kmemleak disabled without freeing internal data. Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\".\n");
+}
+
+static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
+
+/*
+ * Disable kmemleak. No memory allocation/freeing will be traced once this
+ * function is called. Disabling kmemleak is an irreversible operation.
+ */
+static void kmemleak_disable(void)
+{
+	/* atomically check whether it was already invoked */
+	if (cmpxchg(&kmemleak_error, 0, 1))
+		return;
+
+	/* stop any memory operation tracing */
+	kmemleak_enabled = 0;
+
+	/*
+	 * check whether it is too early for a kernel thread; if so, freeing
+	 * is no longer tracked right away, otherwise kmemleak_do_cleanup()
+	 * turns it off once the scan thread has been stopped
+	 */
+	if (kmemleak_initialized)
+		schedule_work(&cleanup_work);
+	else
+		kmemleak_free_enabled = 0;
+
+	pr_info("Kernel memory leak detector disabled\n");
+}
+
+/*
+ * Allow boot-time kmemleak disabling (enabled by default).
+ *
+ * "kmemleak=off" disables kmemleak; "kmemleak=on" sets kmemleak_skip_disable,
+ * which kmemleak_init() checks under CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF. Any
+ * other value is rejected with -EINVAL.
+ */
+static int __init kmemleak_boot_config(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	if (strcmp(str, "off") == 0)
+		kmemleak_disable();
+	else if (strcmp(str, "on") == 0)
+		kmemleak_skip_disable = 1;
+	else
+		return -EINVAL;
+	return 0;
+}
+early_param("kmemleak", kmemleak_boot_config);
+
+/*
+ * Kmemleak initialization.
+ */
+void __init kmemleak_init(void)
+{
+#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
+	/* default-off builds stay disabled unless "kmemleak=on" was given */
+	if (!kmemleak_skip_disable) {
+		kmemleak_disable();
+		return;
+	}
+#endif
+
+	/* kmemleak was already disabled, nothing to set up */
+	if (kmemleak_error)
+		return;
+
+	jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
+	jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
+
+	object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
+	scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
+
+	/* register the data/bss sections */
+	create_object((unsigned long)_sdata, _edata - _sdata,
+		      KMEMLEAK_GREY, GFP_ATOMIC);
+	create_object((unsigned long)__bss_start, __bss_stop - __bss_start,
+		      KMEMLEAK_GREY, GFP_ATOMIC);
+	/* only register .data..ro_after_init if not within .data */
+	if (&__start_ro_after_init < &_sdata || &__end_ro_after_init > &_edata)
+		create_object((unsigned long)__start_ro_after_init,
+			      __end_ro_after_init - __start_ro_after_init,
+			      KMEMLEAK_GREY, GFP_ATOMIC);
+}
+
+/*
+ * Late initialization function.
+ */ +static int __init kmemleak_late_init(void) +{ + kmemleak_initialized = 1; + + debugfs_create_file("kmemleak", 0644, NULL, NULL, &kmemleak_fops); + + if (kmemleak_error) { + /* + * Some error occurred and kmemleak was disabled. There is a + * small chance that kmemleak_disable() was called immediately + * after setting kmemleak_initialized and we may end up with + * two clean-up threads but serialized by scan_mutex. + */ + schedule_work(&cleanup_work); + return -ENOMEM; + } + + if (IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN)) { + mutex_lock(&scan_mutex); + start_scan_thread(); + mutex_unlock(&scan_mutex); + } + + pr_info("Kernel memory leak detector initialized (mem pool available: %d)\n", + mem_pool_free_count); + + return 0; +} +late_initcall(kmemleak_late_init); diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile new file mode 100644 index 000000000..98eab2856 --- /dev/null +++ b/mm/kmsan/Makefile @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for KernelMemorySanitizer (KMSAN). +# +# +obj-y := core.o instrumentation.o init.o hooks.o report.o shadow.o + +KMSAN_SANITIZE := n +KCOV_INSTRUMENT := n +UBSAN_SANITIZE := n + +# Disable instrumentation of KMSAN runtime with other tools. 
+CC_FLAGS_KMSAN_RUNTIME := -fno-stack-protector +CC_FLAGS_KMSAN_RUNTIME += $(call cc-option,-fno-conserve-stack) +CC_FLAGS_KMSAN_RUNTIME += -DDISABLE_BRANCH_PROFILING + +CFLAGS_REMOVE.o = $(CC_FLAGS_FTRACE) + +CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_init.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME) + +obj-$(CONFIG_KMSAN_KUNIT_TEST) += kmsan_test.o +KMSAN_SANITIZE_kmsan_test.o := y +CFLAGS_kmsan_test.o += $(call cc-disable-warning, uninitialized) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c new file mode 100644 index 000000000..112dce135 --- /dev/null +++ b/mm/kmsan/core.c @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN runtime library. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../slab.h" +#include "kmsan.h" + +bool kmsan_enabled __read_mostly; + +/* + * Per-CPU KMSAN context to be used in interrupts, where current->kmsan is + * unavailable.
+ */ +DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); + +void kmsan_internal_task_create(struct task_struct *task) +{ + struct kmsan_ctx *ctx = &task->kmsan_ctx; + struct thread_info *info = current_thread_info(); + + __memset(ctx, 0, sizeof(*ctx)); + ctx->allow_reporting = true; + kmsan_internal_unpoison_memory(info, sizeof(*info), false); +} + +void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags, + unsigned int poison_flags) +{ + u32 extra_bits = + kmsan_extra_bits(/*depth*/ 0, poison_flags & KMSAN_POISON_FREE); + bool checked = poison_flags & KMSAN_POISON_CHECK; + depot_stack_handle_t handle; + + handle = kmsan_save_stack_with_flags(flags, extra_bits); + kmsan_internal_set_shadow_origin(address, size, -1, handle, checked); +} + +void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked) +{ + kmsan_internal_set_shadow_origin(address, size, 0, 0, checked); +} + +depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, + unsigned int extra) +{ + unsigned long entries[KMSAN_STACK_DEPTH]; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); + + /* Don't sleep (see might_sleep_if() in __alloc_pages_nodemask()). */ + flags &= ~__GFP_DIRECT_RECLAIM; + + return __stack_depot_save(entries, nr_entries, extra, flags, true); +} + +/* Copy the metadata following the memmove() behavior. 
*/ +void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n) +{ + depot_stack_handle_t old_origin = 0, new_origin = 0; + int src_slots, dst_slots, i, iter, step, skip_bits; + depot_stack_handle_t *origin_src, *origin_dst; + void *shadow_src, *shadow_dst; + u32 *align_shadow_src, shadow; + bool backwards; + + shadow_dst = kmsan_get_metadata(dst, KMSAN_META_SHADOW); + if (!shadow_dst) + return; + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(dst, n)); + + shadow_src = kmsan_get_metadata(src, KMSAN_META_SHADOW); + if (!shadow_src) { + /* + * @src is untracked: zero out destination shadow, ignore the + * origins, we're done. + */ + __memset(shadow_dst, 0, n); + return; + } + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(src, n)); + + __memmove(shadow_dst, shadow_src, n); + + origin_dst = kmsan_get_metadata(dst, KMSAN_META_ORIGIN); + origin_src = kmsan_get_metadata(src, KMSAN_META_ORIGIN); + KMSAN_WARN_ON(!origin_dst || !origin_src); + src_slots = (ALIGN((u64)src + n, KMSAN_ORIGIN_SIZE) - + ALIGN_DOWN((u64)src, KMSAN_ORIGIN_SIZE)) / + KMSAN_ORIGIN_SIZE; + dst_slots = (ALIGN((u64)dst + n, KMSAN_ORIGIN_SIZE) - + ALIGN_DOWN((u64)dst, KMSAN_ORIGIN_SIZE)) / + KMSAN_ORIGIN_SIZE; + KMSAN_WARN_ON((src_slots < 1) || (dst_slots < 1)); + KMSAN_WARN_ON((src_slots - dst_slots > 1) || + (dst_slots - src_slots > 1)); + + backwards = dst > src; + i = backwards ? min(src_slots, dst_slots) - 1 : 0; + iter = backwards ? -1 : 1; + + align_shadow_src = + (u32 *)ALIGN_DOWN((u64)shadow_src, KMSAN_ORIGIN_SIZE); + for (step = 0; step < min(src_slots, dst_slots); step++, i += iter) { + KMSAN_WARN_ON(i < 0); + shadow = align_shadow_src[i]; + if (i == 0) { + /* + * If @src isn't aligned on KMSAN_ORIGIN_SIZE, don't + * look at the first @src % KMSAN_ORIGIN_SIZE bytes + * of the first shadow slot.
+ */ + skip_bits = ((u64)src % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow >> skip_bits) << skip_bits; + } + if (i == src_slots - 1) { + /* + * If @src + n isn't aligned on + * KMSAN_ORIGIN_SIZE, don't look at the last + * (@src + n) % KMSAN_ORIGIN_SIZE bytes of the + * last shadow slot. + */ + skip_bits = (((u64)src + n) % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow << skip_bits) >> skip_bits; + } + /* + * Overwrite the origin only if the corresponding + * shadow is nonempty. + */ + if (origin_src[i] && (origin_src[i] != old_origin) && shadow) { + old_origin = origin_src[i]; + new_origin = kmsan_internal_chain_origin(old_origin); + /* + * kmsan_internal_chain_origin() may return + * NULL, but we don't want to lose the previous + * origin value. + */ + if (!new_origin) + new_origin = old_origin; + } + if (shadow) + origin_dst[i] = new_origin; + else + origin_dst[i] = 0; + } + /* + * If dst_slots is greater than src_slots (i.e. + * dst_slots == src_slots + 1), there is an extra origin slot at the + * beginning or end of the destination buffer, for which we take the + * origin from the previous slot. + * This is only done if the part of the source shadow corresponding to + * slot is non-zero. + * + * E.g. 
if we copy 8 aligned bytes that are marked as uninitialized + * and have origins o111 and o222, to an unaligned buffer with offset 1, + * these two origins are copied to three origin slots, so one of them + * needs to be duplicated, depending on the copy direction (@backwards) + * + * src shadow: |uuuu|uuuu|....| + * src origin: |o111|o222|....| + * + * backwards = 0: + * dst shadow: |.uuu|uuuu|u...| + * dst origin: |....|o111|o222| - fill the empty slot with o111 + * backwards = 1: + * dst shadow: |.uuu|uuuu|u...| + * dst origin: |o111|o222|....| - fill the empty slot with o222 + */ + if (src_slots < dst_slots) { + if (backwards) { + shadow = align_shadow_src[src_slots - 1]; + skip_bits = (((u64)dst + n) % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow << skip_bits) >> skip_bits; + if (shadow) + /* src_slots > 0, therefore dst_slots is at least 2 */ + origin_dst[dst_slots - 1] = + origin_dst[dst_slots - 2]; + } else { + shadow = align_shadow_src[0]; + skip_bits = ((u64)dst % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow >> skip_bits) << skip_bits; + if (shadow) + origin_dst[0] = origin_dst[1]; + } + } +} + +depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) +{ + unsigned long entries[3]; + u32 extra_bits; + int depth; + bool uaf; + + if (!id) + return id; + /* + * Make sure we have enough spare bits in @id to hold the UAF bit and + * the chain depth. + */ + BUILD_BUG_ON( + (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1)); + + extra_bits = stack_depot_get_extra_bits(id); + depth = kmsan_depth_from_eb(extra_bits); + uaf = kmsan_uaf_from_eb(extra_bits); + + /* + * Stop chaining origins once the depth reached KMSAN_MAX_ORIGIN_DEPTH. + * This mostly happens in the case structures with uninitialized padding + * are copied around many times. Origin chains for such structures are + * usually periodic, and it does not make sense to fully store them.
+ */ + if (depth == KMSAN_MAX_ORIGIN_DEPTH) + return id; + + depth++; + extra_bits = kmsan_extra_bits(depth, uaf); + + entries[0] = KMSAN_CHAIN_MAGIC_ORIGIN; + entries[1] = kmsan_save_stack_with_flags(GFP_ATOMIC, 0); + entries[2] = id; + /* + * @entries is a local var in non-instrumented code, so KMSAN does not + * know it is initialized. Explicitly unpoison it to avoid false + * positives when __stack_depot_save() passes it to instrumented code. + */ + kmsan_internal_unpoison_memory(entries, sizeof(entries), false); + return __stack_depot_save(entries, ARRAY_SIZE(entries), extra_bits, + GFP_ATOMIC, true); +} + +void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, + u32 origin, bool checked) +{ + u64 address = (u64)addr; + void *shadow_start; + u32 *origin_start; + size_t pad = 0; + + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); + shadow_start = kmsan_get_metadata(addr, KMSAN_META_SHADOW); + if (!shadow_start) { + /* + * kmsan_metadata_is_contiguous() is true, so either all shadow + * and origin pages are NULL, or all are non-NULL. 
+ */ + if (checked) { + pr_err("%s: not memsetting %zu bytes starting at %px, because the shadow is NULL\n", + __func__, size, addr); + KMSAN_WARN_ON(true); + } + return; + } + __memset(shadow_start, b, size); + + if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) { + pad = address % KMSAN_ORIGIN_SIZE; + address -= pad; + size += pad; + } + size = ALIGN(size, KMSAN_ORIGIN_SIZE); + origin_start = + (u32 *)kmsan_get_metadata((void *)address, KMSAN_META_ORIGIN); + + for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) + origin_start[i] = origin; +} + +struct page *kmsan_vmalloc_to_page_or_null(void *vaddr) +{ + struct page *page; + + if (!kmsan_internal_is_vmalloc_addr(vaddr) && + !kmsan_internal_is_module_addr(vaddr)) + return NULL; + page = vmalloc_to_page(vaddr); + if (pfn_valid(page_to_pfn(page))) + return page; + else + return NULL; +} + +void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, + int reason) +{ + depot_stack_handle_t cur_origin = 0, new_origin = 0; + unsigned long addr64 = (unsigned long)addr; + depot_stack_handle_t *origin = NULL; + unsigned char *shadow = NULL; + int cur_off_start = -1; + int chunk_size; + size_t pos = 0; + + if (!size) + return; + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); + while (pos < size) { + chunk_size = min(size - pos, + PAGE_SIZE - ((addr64 + pos) % PAGE_SIZE)); + shadow = kmsan_get_metadata((void *)(addr64 + pos), + KMSAN_META_SHADOW); + if (!shadow) { + /* + * This page is untracked. If there were uninitialized + * bytes before, report them. + */ + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos - 1, user_addr, + reason); + kmsan_leave_runtime(); + } + cur_origin = 0; + cur_off_start = -1; + pos += chunk_size; + continue; + } + for (int i = 0; i < chunk_size; i++) { + if (!shadow[i]) { + /* + * This byte is unpoisoned. If there were + * poisoned bytes before, report them.
+ */ + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos + i - 1, + user_addr, reason); + kmsan_leave_runtime(); + } + cur_origin = 0; + cur_off_start = -1; + continue; + } + origin = kmsan_get_metadata((void *)(addr64 + pos + i), + KMSAN_META_ORIGIN); + KMSAN_WARN_ON(!origin); + new_origin = *origin; + /* + * Encountered new origin - report the previous + * uninitialized range. + */ + if (cur_origin != new_origin) { + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos + i - 1, + user_addr, reason); + kmsan_leave_runtime(); + } + cur_origin = new_origin; + cur_off_start = pos + i; + } + } + pos += chunk_size; + } + KMSAN_WARN_ON(pos != size); + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1, + user_addr, reason); + kmsan_leave_runtime(); + } +} + +bool kmsan_metadata_is_contiguous(void *addr, size_t size) +{ + char *cur_shadow = NULL, *next_shadow = NULL, *cur_origin = NULL, + *next_origin = NULL; + u64 cur_addr = (u64)addr, next_addr = cur_addr + PAGE_SIZE; + depot_stack_handle_t *origin_p; + bool all_untracked = false; + + if (!size) + return true; + + /* The whole range belongs to the same page. 
*/ + if (ALIGN_DOWN(cur_addr + size - 1, PAGE_SIZE) == + ALIGN_DOWN(cur_addr, PAGE_SIZE)) + return true; + + cur_shadow = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ false); + if (!cur_shadow) + all_untracked = true; + cur_origin = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ true); + if (all_untracked && cur_origin) + goto report; + + for (; next_addr < (u64)addr + size; + cur_addr = next_addr, cur_shadow = next_shadow, + cur_origin = next_origin, next_addr += PAGE_SIZE) { + next_shadow = kmsan_get_metadata((void *)next_addr, false); + next_origin = kmsan_get_metadata((void *)next_addr, true); + if (all_untracked) { + if (next_shadow || next_origin) + goto report; + if (!next_shadow && !next_origin) + continue; + } + if (((u64)cur_shadow == ((u64)next_shadow - PAGE_SIZE)) && + ((u64)cur_origin == ((u64)next_origin - PAGE_SIZE))) + continue; + goto report; + } + return true; + +report: + pr_err("%s: attempting to access two shadow page ranges.\n", __func__); + pr_err("Access of size %zu at %px.\n", size, addr); + pr_err("Addresses belonging to different ranges: %px and %px\n", + (void *)cur_addr, (void *)next_addr); + pr_err("page[0].shadow: %px, page[1].shadow: %px\n", cur_shadow, + next_shadow); + pr_err("page[0].origin: %px, page[1].origin: %px\n", cur_origin, + next_origin); + origin_p = kmsan_get_metadata(addr, KMSAN_META_ORIGIN); + if (origin_p) { + pr_err("Origin: %08x\n", *origin_p); + kmsan_print_origin(*origin_p); + } else { + pr_err("Origin: unavailable\n"); + } + return false; +} diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c new file mode 100644 index 000000000..ec0da72e6 --- /dev/null +++ b/mm/kmsan/hooks.c @@ -0,0 +1,424 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN hooks for kernel subsystems. + * + * These functions handle creation of KMSAN metadata for memory allocations.
+ * + * Copyright (C) 2018-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "../slab.h" +#include "kmsan.h" + +/* + * Instrumented functions shouldn't be called under + * kmsan_enter_runtime()/kmsan_leave_runtime(), because this will lead to + * skipping effects of functions like memset() inside instrumented code. + */ + +void kmsan_task_create(struct task_struct *task) +{ + kmsan_enter_runtime(); + kmsan_internal_task_create(task); + kmsan_leave_runtime(); +} + +void kmsan_task_exit(struct task_struct *task) +{ + struct kmsan_ctx *ctx = &task->kmsan_ctx; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ctx->allow_reporting = false; +} + +void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) +{ + if (unlikely(object == NULL)) + return; + if (!kmsan_enabled || kmsan_in_runtime()) + return; + /* + * There's a ctor or this is an RCU cache - do nothing. The memory + * status hasn't changed since last use. + */ + if (s->ctor || (s->flags & SLAB_TYPESAFE_BY_RCU)) + return; + + kmsan_enter_runtime(); + if (flags & __GFP_ZERO) + kmsan_internal_unpoison_memory(object, s->object_size, + KMSAN_POISON_CHECK); + else + kmsan_internal_poison_memory(object, s->object_size, flags, + KMSAN_POISON_CHECK); + kmsan_leave_runtime(); +} + +void kmsan_slab_free(struct kmem_cache *s, void *object) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))) + return; + /* + * If there's a constructor, freed memory must remain in the same state + * until the next allocation. We cannot save its state to detect + * use-after-free bugs, instead we just keep it unpoisoned. 
+ */ + if (s->ctor) + return; + kmsan_enter_runtime(); + kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + +void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) +{ + if (unlikely(ptr == NULL)) + return; + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + if (flags & __GFP_ZERO) + kmsan_internal_unpoison_memory((void *)ptr, size, + /*checked*/ true); + else + kmsan_internal_poison_memory((void *)ptr, size, flags, + KMSAN_POISON_CHECK); + kmsan_leave_runtime(); +} + +void kmsan_kfree_large(const void *ptr) +{ + struct page *page; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + page = virt_to_head_page((void *)ptr); + KMSAN_WARN_ON(ptr != page_address(page)); + kmsan_internal_poison_memory((void *)ptr, + PAGE_SIZE << compound_order(page), + GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + +static unsigned long vmalloc_shadow(unsigned long addr) +{ + return (unsigned long)kmsan_get_metadata((void *)addr, + KMSAN_META_SHADOW); +} + +static unsigned long vmalloc_origin(unsigned long addr) +{ + return (unsigned long)kmsan_get_metadata((void *)addr, + KMSAN_META_ORIGIN); +} + +void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end) +{ + __vunmap_range_noflush(vmalloc_shadow(start), vmalloc_shadow(end)); + __vunmap_range_noflush(vmalloc_origin(start), vmalloc_origin(end)); + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); +} + +/* + * This function creates new shadow/origin pages for the physical pages mapped + * into the virtual memory. If those physical pages already had shadow/origin, + * those are ignored. 
+ */ +int kmsan_ioremap_page_range(unsigned long start, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift) +{ + gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO; + struct page *shadow, *origin; + unsigned long off = 0; + int nr, err = 0, clean = 0, mapped; + + if (!kmsan_enabled || kmsan_in_runtime()) + return 0; + + nr = (end - start) / PAGE_SIZE; + kmsan_enter_runtime(); + for (int i = 0; i < nr; i++, off += PAGE_SIZE, clean = i) { + shadow = alloc_pages(gfp_mask, 1); + origin = alloc_pages(gfp_mask, 1); + if (!shadow || !origin) { + err = -ENOMEM; + goto ret; + } + mapped = __vmap_pages_range_noflush( + vmalloc_shadow(start + off), + vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow, + PAGE_SHIFT); + if (mapped) { + err = mapped; + goto ret; + } + shadow = NULL; + mapped = __vmap_pages_range_noflush( + vmalloc_origin(start + off), + vmalloc_origin(start + off + PAGE_SIZE), prot, &origin, + PAGE_SHIFT); + if (mapped) { + __vunmap_range_noflush( + vmalloc_shadow(start + off), + vmalloc_shadow(start + off + PAGE_SIZE)); + err = mapped; + goto ret; + } + origin = NULL; + } + /* Page mapping loop finished normally, nothing to clean up. */ + clean = 0; + +ret: + /* + * Something went wrong if err is set: free the shadow/origin pages of the + * failed iteration, even the first one (clean == 0; they are uninitialized + * if !err and nr == 0), then delete mappings of the previous iterations.
+ */ + if (err && shadow) + __free_pages(shadow, 1); + if (err && origin) + __free_pages(origin, 1); + if (clean > 0) { + __vunmap_range_noflush( + vmalloc_shadow(start), + vmalloc_shadow(start + clean * PAGE_SIZE)); + __vunmap_range_noflush( + vmalloc_origin(start), + vmalloc_origin(start + clean * PAGE_SIZE)); + } + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); + kmsan_leave_runtime(); + return err; +} + +void kmsan_iounmap_page_range(unsigned long start, unsigned long end) +{ + unsigned long v_shadow, v_origin; + struct page *shadow, *origin; + int nr; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + nr = (end - start) / PAGE_SIZE; + kmsan_enter_runtime(); + v_shadow = (unsigned long)vmalloc_shadow(start); + v_origin = (unsigned long)vmalloc_origin(start); + for (int i = 0; i < nr; + i++, v_shadow += PAGE_SIZE, v_origin += PAGE_SIZE) { + shadow = kmsan_vmalloc_to_page_or_null((void *)v_shadow); + origin = kmsan_vmalloc_to_page_or_null((void *)v_origin); + __vunmap_range_noflush(v_shadow, vmalloc_shadow(end)); + __vunmap_range_noflush(v_origin, vmalloc_origin(end)); + if (shadow) + __free_pages(shadow, 1); + if (origin) + __free_pages(origin, 1); + } + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); + kmsan_leave_runtime(); +} + +void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, + size_t left) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + /* + * At this point we've copied the memory already. It's hard to check it + * before copying, as the size of actually copied buffer is unknown. + */ + + /* copy_to_user() may copy zero bytes. No need to check. */ + if (!to_copy) + return; + /* Or maybe copy_to_user() failed to copy anything.
*/ + if (to_copy <= left) + return; + + ua_flags = user_access_save(); + if ((u64)to < TASK_SIZE) { + /* This is a user memory access, check it. */ + kmsan_internal_check_memory((void *)from, to_copy - left, to, + REASON_COPY_TO_USER); + } else { + /* Otherwise this is a kernel memory access. This happens when a + * compat syscall passes an argument allocated on the kernel + * stack to a real syscall. + * Don't check anything, just copy the shadow of the copied + * bytes. + */ + kmsan_internal_memmove_metadata((void *)to, (void *)from, + to_copy - left); + } + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(kmsan_copy_to_user); + +/* Helper function to check an URB. */ +void kmsan_handle_urb(const struct urb *urb, bool is_out) +{ + if (!urb) + return; + if (is_out) + kmsan_internal_check_memory(urb->transfer_buffer, + urb->transfer_buffer_length, + /*user_addr*/ 0, REASON_SUBMIT_URB); + else + kmsan_internal_unpoison_memory(urb->transfer_buffer, + urb->transfer_buffer_length, + /*checked*/ false); +} +EXPORT_SYMBOL_GPL(kmsan_handle_urb); + +static void kmsan_handle_dma_page(const void *addr, size_t size, + enum dma_data_direction dir) +{ + switch (dir) { + case DMA_BIDIRECTIONAL: + kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); + kmsan_internal_unpoison_memory((void *)addr, size, + /*checked*/ false); + break; + case DMA_TO_DEVICE: + kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); + break; + case DMA_FROM_DEVICE: + kmsan_internal_unpoison_memory((void *)addr, size, + /*checked*/ false); + break; + case DMA_NONE: + break; + } +} + +/* Helper function to handle DMA data transfers. */ +void kmsan_handle_dma(struct page *page, size_t offset, size_t size, + enum dma_data_direction dir) +{ + u64 page_offset, to_go, addr; + + if (PageHighMem(page)) + return; + addr = (u64)page_address(page) + offset; + /* + * The kernel may occasionally give us adjacent DMA pages not belonging + * to the same allocation. 
Process them separately to avoid triggering + * internal KMSAN checks. + */ + while (size > 0) { + page_offset = addr % PAGE_SIZE; + to_go = min(PAGE_SIZE - page_offset, (u64)size); + kmsan_handle_dma_page((void *)addr, to_go, dir); + addr += to_go; + size -= to_go; + } +} + +void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + struct scatterlist *item; + int i; + + for_each_sg(sg, item, nents, i) + kmsan_handle_dma(sg_page(item), item->offset, item->length, + dir); +} + +/* Functions from kmsan-checks.h follow. */ +void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + /* The users may want to poison/unpoison random memory. */ + kmsan_internal_poison_memory((void *)address, size, flags, + KMSAN_POISON_NOCHECK); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(kmsan_poison_memory); + +void kmsan_unpoison_memory(const void *address, size_t size) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + kmsan_enter_runtime(); + /* The users may want to poison/unpoison random memory. */ + kmsan_internal_unpoison_memory((void *)address, size, + KMSAN_POISON_NOCHECK); + kmsan_leave_runtime(); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(kmsan_unpoison_memory); + +/* + * Version of kmsan_unpoison_memory() that can be called from within the KMSAN + * runtime. + * + * Non-instrumented IRQ entry functions receive struct pt_regs from assembly + * code. Those regs need to be unpoisoned, otherwise using them will result in + * false positives. + * Using kmsan_unpoison_memory() is not an option in entry code, because the + * return value of in_task() is inconsistent - as a result, certain calls to + * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that + * the registers are unpoisoned even if kmsan_in_runtime() is true in the early + * entry code. 
+ */ +void kmsan_unpoison_entry_regs(const struct pt_regs *regs) +{ + unsigned long ua_flags; + + if (!kmsan_enabled) + return; + + ua_flags = user_access_save(); + kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs), + KMSAN_POISON_NOCHECK); + user_access_restore(ua_flags); +} + +void kmsan_check_memory(const void *addr, size_t size) +{ + if (!kmsan_enabled) + return; + kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); +} +EXPORT_SYMBOL(kmsan_check_memory); diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c new file mode 100644 index 000000000..7fb794242 --- /dev/null +++ b/mm/kmsan/init.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN initialization routines. + * + * Copyright (C) 2017-2021 Google LLC + * Author: Alexander Potapenko + * + */ + +#include "kmsan.h" + +#include +#include +#include + +#include "../internal.h" + +#define NUM_FUTURE_RANGES 128 +struct start_end_pair { + u64 start, end; +}; + +static struct start_end_pair start_end_pairs[NUM_FUTURE_RANGES] __initdata; +static int future_index __initdata; + +/* + * Record a range of memory for which the metadata pages will be created once + * the page allocator becomes available. + */ +static void __init kmsan_record_future_shadow_range(void *start, void *end) +{ + u64 nstart = (u64)start, nend = (u64)end, cstart, cend; + bool merged = false; + + KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES); + KMSAN_WARN_ON((nstart >= nend) || !nstart || !nend); + nstart = ALIGN_DOWN(nstart, PAGE_SIZE); + nend = ALIGN(nend, PAGE_SIZE); + + /* + * Scan the existing ranges to see if any of them overlaps with + * [start, end). In that case, merge the two ranges instead of + * creating a new one. + * The number of ranges is less than 20, so there is no need to organize + * them into a more intelligent data structure.
+ */ + for (int i = 0; i < future_index; i++) { + cstart = start_end_pairs[i].start; + cend = start_end_pairs[i].end; + if ((cstart < nstart && cend < nstart) || + (cstart > nend && cend > nend)) + /* ranges are disjoint - do not merge */ + continue; + start_end_pairs[i].start = min(nstart, cstart); + start_end_pairs[i].end = max(nend, cend); + merged = true; + break; + } + if (merged || future_index >= NUM_FUTURE_RANGES) + return; /* merged, or table full: never write past start_end_pairs[] */ + start_end_pairs[future_index].start = nstart; + start_end_pairs[future_index].end = nend; + future_index++; +} + +/* + * Initialize the shadow for existing mappings during kernel initialization. + * These include kernel text/data sections, NODE_DATA and future ranges + * registered while creating other data (e.g. percpu). + * + * Allocations via memblock can be only done before slab is initialized. + */ +void __init kmsan_init_shadow(void) +{ + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + phys_addr_t p_start, p_end; + u64 loop; + int nid; + + for_each_reserved_mem_range(loop, &p_start, &p_end) + kmsan_record_future_shadow_range(phys_to_virt(p_start), + phys_to_virt(p_end)); + /* Allocate shadow for .data */ + kmsan_record_future_shadow_range(_sdata, _edata); + + for_each_online_node(nid) + kmsan_record_future_shadow_range( + NODE_DATA(nid), (char *)NODE_DATA(nid) + nd_size); + + for (int i = 0; i < future_index; i++) + kmsan_init_alloc_meta_for_range( + (void *)start_end_pairs[i].start, + (void *)start_end_pairs[i].end); +} + +struct metadata_page_pair { + struct page *shadow, *origin; +}; +static struct metadata_page_pair held_back[MAX_ORDER] __initdata; + +/* + * Eager metadata allocation. When the memblock allocator is freeing pages to + * pagealloc, we use 2/3 of them as metadata for the remaining 1/3.
+ * We store the pointers to the returned blocks of pages in held_back[] grouped + * by their order: when kmsan_memblock_free_pages() is called for the first + * time with a certain order, it is reserved as a shadow block, for the second + * time - as an origin block. On the third time the incoming block receives its + * shadow and origin ranges from the previously saved shadow and origin blocks, + * after which held_back[order] can be used again. + * + * At the very end there may be leftover blocks in held_back[]. They are + * collected later by kmsan_memblock_discard(). + */ +bool kmsan_memblock_free_pages(struct page *page, unsigned int order) +{ + struct page *shadow, *origin; + + if (!held_back[order].shadow) { + held_back[order].shadow = page; + return false; + } + if (!held_back[order].origin) { + held_back[order].origin = page; + return false; + } + shadow = held_back[order].shadow; + origin = held_back[order].origin; + kmsan_setup_meta(page, shadow, origin, order); + + held_back[order].shadow = NULL; + held_back[order].origin = NULL; + return true; +} + +#define MAX_BLOCKS 8 +struct smallstack { + struct page *items[MAX_BLOCKS]; + int index; + int order; +}; + +static struct smallstack collect = { + .index = 0, + .order = MAX_ORDER, +}; + +static void smallstack_push(struct smallstack *stack, struct page *pages) +{ + KMSAN_WARN_ON(stack->index == MAX_BLOCKS); + stack->items[stack->index] = pages; + stack->index++; +} +#undef MAX_BLOCKS + +static struct page *smallstack_pop(struct smallstack *stack) +{ + struct page *ret; + + KMSAN_WARN_ON(stack->index == 0); + stack->index--; + ret = stack->items[stack->index]; + stack->items[stack->index] = NULL; + return ret; +} + +static void do_collection(void) +{ + struct page *page, *shadow, *origin; + + while (collect.index >= 3) { + page = smallstack_pop(&collect); + shadow = smallstack_pop(&collect); + origin = smallstack_pop(&collect); + kmsan_setup_meta(page, shadow, origin, collect.order); + 
__free_pages_core(page, collect.order); + } +} + +static void collect_split(void) +{ + struct smallstack tmp = { + .order = collect.order - 1, + .index = 0, + }; + struct page *page; + + if (!collect.order) + return; + while (collect.index) { + page = smallstack_pop(&collect); + smallstack_push(&tmp, &page[0]); + smallstack_push(&tmp, &page[1 << tmp.order]); + } + __memcpy(&collect, &tmp, sizeof(tmp)); +} + +/* + * Memblock is about to go away. Split the page blocks left over in held_back[] + * and return 1/3 of that memory to the system. + */ +static void kmsan_memblock_discard(void) +{ + /* + * For each order=N: + * - push held_back[N].shadow and .origin to @collect; + * - while there are >= 3 elements in @collect, do garbage collection: + * - pop 3 ranges from @collect; + * - use two of them as shadow and origin for the third one; + * - repeat; + * - split each remaining element from @collect into 2 ranges of + * order=N-1, + * - repeat. + */ + collect.order = MAX_ORDER - 1; + for (int i = MAX_ORDER - 1; i >= 0; i--) { + if (held_back[i].shadow) + smallstack_push(&collect, held_back[i].shadow); + if (held_back[i].origin) + smallstack_push(&collect, held_back[i].origin); + held_back[i].shadow = NULL; + held_back[i].origin = NULL; + do_collection(); + collect_split(); + } +} + +void __init kmsan_init_runtime(void) +{ + /* Assuming current is init_task */ + kmsan_internal_task_create(current); + kmsan_memblock_discard(); + pr_info("Starting KernelMemorySanitizer\n"); + pr_info("ATTENTION: KMSAN is a debugging tool! Do not use it on production machines!\n"); + kmsan_enabled = true; +} diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c new file mode 100644 index 000000000..271f135f9 --- /dev/null +++ b/mm/kmsan/instrumentation.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN compiler API. + * + * This file implements __msan_XXX hooks that Clang inserts into the code + * compiled with -fsanitize=kernel-memory. 
+ * See Documentation/dev-tools/kmsan.rst for more information on how KMSAN + * instrumentation works. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include "kmsan.h" +#include +#include +#include +#include + +static inline bool is_bad_asm_addr(void *addr, uintptr_t size, bool is_store) +{ + if ((u64)addr < TASK_SIZE) + return true; + if (!kmsan_get_metadata(addr, KMSAN_META_SHADOW)) + return true; + return false; +} + +static inline struct shadow_origin_ptr +get_shadow_origin_ptr(void *addr, u64 size, bool store) +{ + unsigned long ua_flags = user_access_save(); + struct shadow_origin_ptr ret; + + ret = kmsan_get_shadow_origin_ptr(addr, size, store); + user_access_restore(ua_flags); + return ret; +} + +/* Get shadow and origin pointers for a memory load with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, + uintptr_t size) +{ + return get_shadow_origin_ptr(addr, size, /*store*/ false); +} +EXPORT_SYMBOL(__msan_metadata_ptr_for_load_n); + +/* Get shadow and origin pointers for a memory store with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr, + uintptr_t size) +{ + return get_shadow_origin_ptr(addr, size, /*store*/ true); +} +EXPORT_SYMBOL(__msan_metadata_ptr_for_store_n); + +/* + * Declare functions that obtain shadow/origin pointers for loads and stores + * with fixed size. 
+ */ +#define DECLARE_METADATA_PTR_GETTER(size) \ + struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \ + void *addr) \ + { \ + return get_shadow_origin_ptr(addr, size, /*store*/ false); \ + } \ + EXPORT_SYMBOL(__msan_metadata_ptr_for_load_##size); \ + struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \ + void *addr) \ + { \ + return get_shadow_origin_ptr(addr, size, /*store*/ true); \ + } \ + EXPORT_SYMBOL(__msan_metadata_ptr_for_store_##size) + +DECLARE_METADATA_PTR_GETTER(1); +DECLARE_METADATA_PTR_GETTER(2); +DECLARE_METADATA_PTR_GETTER(4); +DECLARE_METADATA_PTR_GETTER(8); + +/* + * Handle a memory store performed by inline assembly. KMSAN conservatively + * attempts to unpoison the outputs of asm() directives to prevent false + * positives caused by missed stores. + */ +void __msan_instrument_asm_store(void *addr, uintptr_t size) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + /* + * Most of the accesses are below 32 bytes. The two exceptions so far + * are clwb() (64 bytes) and FPU state (512 bytes). + * It's unlikely that the assembly will touch more than 512 bytes. + */ + if (size > 512) { + /* @size is unsigned, so print it with an unsigned conversion. */ + WARN_ONCE(1, "assembly store size too big: %lu\n", size); + size = 8; + } + if (is_bad_asm_addr(addr, size, /*is_store*/ true)) { + user_access_restore(ua_flags); + return; + } + kmsan_enter_runtime(); + /* Unpoisoning the memory on best effort. */ + kmsan_internal_unpoison_memory(addr, size, /*checked*/ false); + kmsan_leave_runtime(); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(__msan_instrument_asm_store); + +/* + * KMSAN instrumentation pass replaces LLVM memcpy, memmove and memset + * intrinsics with calls to respective __msan_ functions. We use + * get_param0_metadata() and set_retval_metadata() to store the shadow/origin + * values for the destination argument of these functions and use them for the + * functions' return values.
+ */ +static inline void get_param0_metadata(u64 *shadow, + depot_stack_handle_t *origin) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + *shadow = *(u64 *)(ctx->cstate.param_tls); + *origin = ctx->cstate.param_origin_tls[0]; +} + +static inline void set_retval_metadata(u64 shadow, depot_stack_handle_t origin) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + *(u64 *)(ctx->cstate.retval_tls) = shadow; + ctx->cstate.retval_origin_tls = origin; +} + +/* Handle llvm.memmove intrinsic. */ +void *__msan_memmove(void *dst, const void *src, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memmove(dst, src, n); + if (!n) + /* Some people call memmove() with zero length. */ + return result; + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + kmsan_internal_memmove_metadata(dst, (void *)src, n); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memmove); + +/* Handle llvm.memcpy intrinsic. */ +void *__msan_memcpy(void *dst, const void *src, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memcpy(dst, src, n); + if (!n) + /* Some people call memcpy() with zero length. */ + return result; + + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + /* Using memmove instead of memcpy doesn't affect correctness. */ + kmsan_internal_memmove_metadata(dst, (void *)src, n); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memcpy); + +/* Handle llvm.memset intrinsic. 
*/ +void *__msan_memset(void *dst, int c, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memset(dst, c, n); + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + /* + * Clang doesn't pass parameter metadata here, so it is impossible to + * use shadow of @c to set up the shadow for @dst. + */ + kmsan_internal_unpoison_memory(dst, n, /*checked*/ false); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memset); + +/* + * Create a new origin from an old one. This is done when storing an + * uninitialized value to memory. When reporting an error, KMSAN unrolls and + * prints the whole chain of stores that preceded the use of this value. + */ +depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin) +{ + depot_stack_handle_t ret = 0; + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return ret; + + ua_flags = user_access_save(); + + /* Creating new origins may allocate memory. */ + kmsan_enter_runtime(); + ret = kmsan_internal_chain_origin(origin); + kmsan_leave_runtime(); + user_access_restore(ua_flags); + return ret; +} +EXPORT_SYMBOL(__msan_chain_origin); + +/* Poison a local variable when entering a function. */ +void __msan_poison_alloca(void *address, uintptr_t size, char *descr) +{ + depot_stack_handle_t handle; + unsigned long entries[4]; + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + entries[0] = KMSAN_ALLOCA_MAGIC_ORIGIN; + entries[1] = (u64)descr; + entries[2] = (u64)__builtin_return_address(0); + /* + * With frame pointers enabled, it is possible to quickly fetch the + * second frame of the caller stack without calling the unwinder. + * Without them, simply do not bother. 
+ */ + if (IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER)) + entries[3] = (u64)__builtin_return_address(1); + else + entries[3] = 0; + + /* stack_depot_save() may allocate memory. */ + kmsan_enter_runtime(); + handle = stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC); + kmsan_leave_runtime(); + + kmsan_internal_set_shadow_origin(address, size, -1, handle, + /*checked*/ true); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(__msan_poison_alloca); + +/* Unpoison a local variable. */ +void __msan_unpoison_alloca(void *address, uintptr_t size) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + kmsan_enter_runtime(); + kmsan_internal_unpoison_memory(address, size, /*checked*/ true); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(__msan_unpoison_alloca); + +/* + * Report that an uninitialized value with the given origin was used in a way + * that constituted undefined behavior. + */ +void __msan_warning(u32 origin) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + kmsan_report(origin, /*address*/ 0, /*size*/ 0, + /*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ 0, + REASON_ANY); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(__msan_warning); + +/* + * At the beginning of an instrumented function, obtain the pointer to + * `struct kmsan_context_state` holding the metadata for function parameters. + */ +struct kmsan_context_state *__msan_get_context_state(void) +{ + return &kmsan_get_context()->cstate; +} +EXPORT_SYMBOL(__msan_get_context_state); diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h new file mode 100644 index 000000000..a14744205 --- /dev/null +++ b/mm/kmsan/kmsan.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Functions used by the KMSAN runtime. 
+ * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#ifndef __MM_KMSAN_KMSAN_H +#define __MM_KMSAN_KMSAN_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#define KMSAN_ALLOCA_MAGIC_ORIGIN 0xabcd0100 +#define KMSAN_CHAIN_MAGIC_ORIGIN 0xabcd0200 + +#define KMSAN_POISON_NOCHECK 0x0 +#define KMSAN_POISON_CHECK 0x1 +#define KMSAN_POISON_FREE 0x2 + +#define KMSAN_ORIGIN_SIZE 4 +#define KMSAN_MAX_ORIGIN_DEPTH 7 + +#define KMSAN_STACK_DEPTH 64 + +#define KMSAN_META_SHADOW (false) +#define KMSAN_META_ORIGIN (true) + +extern bool kmsan_enabled; +extern int panic_on_kmsan; + +/* + * KMSAN performs a lot of consistency checks that are currently enabled by + * default. BUG_ON is normally discouraged in the kernel, unless used for + * debugging, but KMSAN itself is a debugging tool, so it makes little sense to + * recover if something goes wrong. + */ +#define KMSAN_WARN_ON(cond) \ + ({ \ + const bool __cond = WARN_ON(cond); \ + if (unlikely(__cond)) { \ + WRITE_ONCE(kmsan_enabled, false); \ + if (panic_on_kmsan) { \ + /* Can't call panic() here because */ \ + /* of uaccess checks. */ \ + BUG(); \ + } \ + } \ + __cond; \ + }) + +/* + * A pair of metadata pointers to be returned by the instrumentation functions. + */ +struct shadow_origin_ptr { + void *shadow, *origin; +}; + +struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *addr, u64 size, + bool store); +void *kmsan_get_metadata(void *addr, bool is_origin); +void __init kmsan_init_alloc_meta_for_range(void *start, void *end); + +enum kmsan_bug_reason { + REASON_ANY, + REASON_COPY_TO_USER, + REASON_SUBMIT_URB, +}; + +void kmsan_print_origin(depot_stack_handle_t origin); + +/** + * kmsan_report() - Report a use of uninitialized value. + * @origin: Stack ID of the uninitialized value. + * @address: Address at which the memory access happens. + * @size: Memory access size. + * @off_first: Offset (from @address) of the first byte to be reported. 
+ * @off_last: Offset (from @address) of the last byte to be reported. + * @user_addr: When non-NULL, denotes the userspace address to which the kernel + * is leaking data. + * @reason: Error type from enum kmsan_bug_reason. + * + * kmsan_report() prints an error message for a consequent group of bytes + * sharing the same origin. If an uninitialized value is used in a comparison, + * this function is called once without specifying the addresses. When checking + * a memory range, KMSAN may call kmsan_report() multiple times with the same + * @address, @size, @user_addr and @reason, but different @off_first and + * @off_last corresponding to different @origin values. + */ +void kmsan_report(depot_stack_handle_t origin, void *address, int size, + int off_first, int off_last, const void *user_addr, + enum kmsan_bug_reason reason); + +DECLARE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); + +static __always_inline struct kmsan_ctx *kmsan_get_context(void) +{ + return in_task() ? ¤t->kmsan_ctx : raw_cpu_ptr(&kmsan_percpu_ctx); +} + +/* + * When a compiler hook or KMSAN runtime function is invoked, it may make a + * call to instrumented code and eventually call itself recursively. To avoid + * that, we guard the runtime entry regions with + * kmsan_enter_runtime()/kmsan_leave_runtime() and exit the hook if + * kmsan_in_runtime() is true. + * + * Non-runtime code may occasionally get executed in nested IRQs from the + * runtime code (e.g. when called via smp_call_function_single()). Because some + * KMSAN routines may take locks (e.g. for memory allocation), we conservatively + * bail out instead of calling them. To minimize the effect of this (potentially + * missing initialization events) kmsan_in_runtime() is not checked in + * non-blocking runtime functions. 
+ */ +static __always_inline bool kmsan_in_runtime(void) +{ + if ((hardirq_count() >> HARDIRQ_SHIFT) > 1) + return true; + if (in_nmi()) + return true; + return kmsan_get_context()->kmsan_in_runtime; +} + +static __always_inline void kmsan_enter_runtime(void) +{ + struct kmsan_ctx *ctx; + + ctx = kmsan_get_context(); + KMSAN_WARN_ON(ctx->kmsan_in_runtime++); +} + +static __always_inline void kmsan_leave_runtime(void) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + KMSAN_WARN_ON(--ctx->kmsan_in_runtime); +} + +depot_stack_handle_t kmsan_save_stack(void); +depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, + unsigned int extra_bits); + +/* + * Pack and unpack the origin chain depth and UAF flag to/from the extra bits + * provided by the stack depot. + * The UAF flag is stored in the lowest bit, followed by the depth in the upper + * bits. + * set_dsh_extra_bits() is responsible for clamping the value. + */ +static __always_inline unsigned int kmsan_extra_bits(unsigned int depth, + bool uaf) +{ + return (depth << 1) | uaf; +} + +static __always_inline bool kmsan_uaf_from_eb(unsigned int extra_bits) +{ + return extra_bits & 1; +} + +static __always_inline unsigned int kmsan_depth_from_eb(unsigned int extra_bits) +{ + return extra_bits >> 1; +} + +/* + * kmsan_internal_ functions are supposed to be very simple and not require the + * kmsan_in_runtime() checks. 
+ */ +void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n); +void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags, + unsigned int poison_flags); +void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked); +void kmsan_internal_set_shadow_origin(void *address, size_t size, int b, + u32 origin, bool checked); +depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id); + +void kmsan_internal_task_create(struct task_struct *task); + +bool kmsan_metadata_is_contiguous(void *addr, size_t size); +void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, + int reason); + +struct page *kmsan_vmalloc_to_page_or_null(void *vaddr); +void kmsan_setup_meta(struct page *page, struct page *shadow, + struct page *origin, int order); + +/* + * kmsan_internal_is_module_addr() and kmsan_internal_is_vmalloc_addr() are + * non-instrumented versions of is_module_address() and is_vmalloc_addr() that + * are safe to call from KMSAN runtime without recursion. + */ +static inline bool kmsan_internal_is_module_addr(void *vaddr) +{ + return ((u64)vaddr >= MODULES_VADDR) && ((u64)vaddr < MODULES_END); +} + +static inline bool kmsan_internal_is_vmalloc_addr(void *addr) +{ + return ((u64)addr >= VMALLOC_START) && ((u64)addr < VMALLOC_END); +} + +#endif /* __MM_KMSAN_KMSAN_H */ diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c new file mode 100644 index 000000000..1328636cb --- /dev/null +++ b/mm/kmsan/kmsan_test.c @@ -0,0 +1,582 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test cases for KMSAN. + * For each test case checks the presence (or absence) of generated reports. + * Relies on 'console' tracepoint to capture reports as they appear in the + * kernel log. + * + * Copyright (C) 2021-2022, Google LLC. 
+ * Author: Alexander Potapenko + * + */ + +#include +#include "kmsan.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(int, per_cpu_var); + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + bool available; + bool ignore; /* Stop console output collection. */ + char header[256]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Probe for console output: obtains observed lines of interest. */ +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + + if (observed.ignore) + return; + spin_lock_irqsave(&observed.lock, flags); + + if (strnstr(buf, "BUG: KMSAN: ", len)) { + /* + * This is a KMSAN report related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.header, buf, + min(len + 1, sizeof(observed.header))); + WRITE_ONCE(observed.available, true); + observed.ignore = true; + } + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +static bool report_available(void) +{ + return READ_ONCE(observed.available); +} + +/* Information we expect in a report. */ +struct expect_report { + const char *error_type; /* Error type. */ + /* + * Kernel symbol from the error header, or NULL if no report is + * expected. + */ + const char *symbol; +}; + +/* Check observed report matches information in @r. */ +static bool report_matches(const struct expect_report *r) +{ + typeof(observed.header) expected_header; + unsigned long flags; + bool ret = false; + const char *end; + char *cur; + + /* Double-checked locking. */ + if (!report_available() || !r->symbol) + return (!report_available() && !r->symbol); + + /* Generate expected report contents.
*/ + + /* Title */ + cur = expected_header; + end = &expected_header[sizeof(expected_header) - 1]; + + cur += scnprintf(cur, end - cur, "BUG: KMSAN: %s", r->error_type); + + scnprintf(cur, end - cur, " in %s", r->symbol); + /* The exact offset won't match, remove it; also strip module name. */ + cur = strchr(expected_header, '+'); + if (cur) + *cur = '\0'; + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. */ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.header, expected_header); +out: + spin_unlock_irqrestore(&observed.lock, flags); + + return ret; +} + +/* ===== Test cases ===== */ + +/* Prevent replacing branch with select in LLVM. */ +static noinline void check_true(char *arg) +{ + pr_info("%s is true\n", arg); +} + +static noinline void check_false(char *arg) +{ + pr_info("%s is false\n", arg); +} + +#define USE(x) \ + do { \ + if (x) \ + check_true(#x); \ + else \ + check_false(#x); \ + } while (0) + +#define EXPECTATION_ETYPE_FN(e, reason, fn) \ + struct expect_report e = { \ + .error_type = reason, \ + .symbol = fn, \ + } + +#define EXPECTATION_NO_REPORT(e) EXPECTATION_ETYPE_FN(e, NULL, NULL) +#define EXPECTATION_UNINIT_VALUE_FN(e, fn) \ + EXPECTATION_ETYPE_FN(e, "uninit-value", fn) +#define EXPECTATION_UNINIT_VALUE(e) EXPECTATION_UNINIT_VALUE_FN(e, __func__) +#define EXPECTATION_USE_AFTER_FREE(e) \ + EXPECTATION_ETYPE_FN(e, "use-after-free", __func__) + +/* Test case: ensure that kmalloc() returns uninitialized memory. */ +static void test_uninit_kmalloc(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + int *ptr; + + kunit_info(test, "uninitialized kmalloc test (UMR report)\n"); + ptr = kmalloc(sizeof(*ptr), GFP_KERNEL); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that kmalloc'ed memory becomes initialized after memset(). 
+ */ +static void test_init_kmalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int *ptr; + + kunit_info(test, "initialized kmalloc test (no reports)\n"); + ptr = kmalloc(sizeof(*ptr), GFP_KERNEL); + memset(ptr, 0, sizeof(*ptr)); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that kzalloc() returns initialized memory. */ +static void test_init_kzalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int *ptr; + + kunit_info(test, "initialized kzalloc test (no reports)\n"); + ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that local variables are uninitialized by default. */ +static void test_uninit_stack_var(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile int cond; + + kunit_info(test, "uninitialized stack variable (UMR report)\n"); + USE(cond); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that local variables with initializers are initialized. */ +static void test_init_stack_var(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + volatile int cond = 1; + + kunit_info(test, "initialized stack variable (no reports)\n"); + USE(cond); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static noinline void two_param_fn_2(int arg1, int arg2) +{ + USE(arg1); + USE(arg2); +} + +static noinline void one_param_fn(int arg) +{ + two_param_fn_2(arg, arg); + USE(arg); +} + +static noinline void two_param_fn(int arg1, int arg2) +{ + int init = 0; + + one_param_fn(init); + USE(arg1); + USE(arg2); +} + +static void test_params(struct kunit *test) +{ +#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL + /* + * With eager param/retval checking enabled, KMSAN will report an error + * before the call to two_param_fn(). 
+ */ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_params"); +#else + EXPECTATION_UNINIT_VALUE_FN(expect, "two_param_fn"); +#endif + volatile int uninit, init = 1; + + kunit_info(test, + "uninit passed through a function parameter (UMR report)\n"); + two_param_fn(uninit, init); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static int signed_sum3(int a, int b, int c) +{ + return a + b + c; +} + +/* + * Test case: ensure that uninitialized values are tracked through function + * arguments. + */ +static void test_uninit_multiple_params(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile char b = 3, c; + volatile int a; + + kunit_info(test, "uninitialized local passed to fn (UMR report)\n"); + USE(signed_sum3(a, b, c)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Helper function to make an array uninitialized. */ +static noinline void do_uninit_local_array(char *array, int start, int stop) +{ + volatile char uninit; + + for (int i = start; i < stop; i++) + array[i] = uninit; +} + +/* + * Test case: ensure kmsan_check_memory() reports an error when checking + * uninitialized memory. + */ +static void test_uninit_kmsan_check_memory(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_uninit_kmsan_check_memory"); + volatile char local_array[8]; + + kunit_info( + test, + "kmsan_check_memory() called on uninit local (UMR report)\n"); + do_uninit_local_array((char *)local_array, 5, 7); + + kmsan_check_memory((char *)local_array, 8); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: check that a virtual memory range created with vmap() from + * initialized pages is still considered as initialized. 
+ */ +static void test_init_kmsan_vmap_vunmap(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + const int npages = 2; + struct page **pages; + void *vbuf; + + kunit_info(test, "pages initialized via vmap (no reports)\n"); + + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + for (int i = 0; i < npages; i++) + pages[i] = alloc_page(GFP_KERNEL); + vbuf = vmap(pages, npages, VM_MAP, PAGE_KERNEL); + memset(vbuf, 0xfe, npages * PAGE_SIZE); + for (int i = 0; i < npages; i++) + kmsan_check_memory(page_address(pages[i]), PAGE_SIZE); + + if (vbuf) + vunmap(vbuf); + for (int i = 0; i < npages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kfree(pages); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memset() can initialize a buffer allocated via + * vmalloc(). + */ +static void test_init_vmalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int npages = 8; + char *buf; + + kunit_info(test, "vmalloc buffer can be initialized (no reports)\n"); + buf = vmalloc(PAGE_SIZE * npages); + buf[0] = 1; + memset(buf, 0xfe, PAGE_SIZE * npages); + USE(buf[0]); + for (int i = 0; i < npages; i++) + kmsan_check_memory(&buf[PAGE_SIZE * i], PAGE_SIZE); + vfree(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that use-after-free reporting works. */ +static void test_uaf(struct kunit *test) +{ + EXPECTATION_USE_AFTER_FREE(expect); + volatile int value; + volatile int *var; + + kunit_info(test, "use-after-free in kmalloc-ed buffer (UMR report)\n"); + var = kmalloc(80, GFP_KERNEL); + var[3] = 0xfeedface; + kfree((int *)var); + /* Copy the invalid value before checking it. */ + value = var[3]; + USE(value); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that uninitialized values are propagated through per-CPU + * memory. 
+ */ +static void test_percpu_propagate(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile int uninit, check; + + kunit_info(test, + "uninit local stored to per_cpu memory (UMR report)\n"); + + this_cpu_write(per_cpu_var, uninit); + check = this_cpu_read(per_cpu_var); + USE(check); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that passing uninitialized values to printk() leads to an + * error report. + */ +static void test_printk(struct kunit *test) +{ +#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL + /* + * With eager param/retval checking enabled, KMSAN will report an error + * before the call to pr_info(). + */ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_printk"); +#else + EXPECTATION_UNINIT_VALUE_FN(expect, "number"); +#endif + volatile int uninit; + + kunit_info(test, "uninit local passed to pr_info() (UMR report)\n"); + pr_info("%px contains %d\n", &uninit, uninit); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and `dst`. + */ +static void test_memcpy_aligned_to_aligned(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_aligned"); + volatile int uninit_src; + volatile int dst = 0; + + kunit_info( + test, + "memcpy()ing aligned uninit src to aligned dst (UMR report)\n"); + memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)&dst, sizeof(dst)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and unaligned `dst`. + * + * Copying aligned 4-byte value to an unaligned one leads to touching two + * aligned 4-byte values. This test case checks that KMSAN correctly reports an + * error on the first of the two values. 
+ */ +static void test_memcpy_aligned_to_unaligned(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_unaligned"); + volatile int uninit_src; + volatile char dst[8] = { 0 }; + + kunit_info( + test, + "memcpy()ing aligned uninit src to unaligned dst (UMR report)\n"); + memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)dst, 4); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and unaligned `dst`. + * + * Copying aligned 4-byte value to an unaligned one leads to touching two + * aligned 4-byte values. This test case checks that KMSAN correctly reports an + * error on the second of the two values. + */ +static void test_memcpy_aligned_to_unaligned2(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, + "test_memcpy_aligned_to_unaligned2"); + volatile int uninit_src; + volatile char dst[8] = { 0 }; + + kunit_info( + test, + "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n"); + memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)&dst[4], sizeof(uninit_src)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static noinline void fibonacci(int *array, int size, int start) { + if (start < 2 || (start == size)) + return; + array[start] = array[start - 1] + array[start - 2]; + fibonacci(array, size, start + 1); +} + +static void test_long_origin_chain(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, + "test_long_origin_chain"); + /* (KMSAN_MAX_ORIGIN_DEPTH * 2) recursive calls to fibonacci(). */ + volatile int accum[KMSAN_MAX_ORIGIN_DEPTH * 2 + 2]; + int last = ARRAY_SIZE(accum) - 1; + + kunit_info( + test, + "origin chain exceeding KMSAN_MAX_ORIGIN_DEPTH (UMR report)\n"); + /* + * We do not set accum[1] to 0, so the uninitializedness will be carried + * over to accum[2..last]. 
+ */ + accum[0] = 1; + fibonacci((int *)accum, ARRAY_SIZE(accum), 2); + kmsan_check_memory((void *)&accum[last], sizeof(int)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static struct kunit_case kmsan_test_cases[] = { + KUNIT_CASE(test_uninit_kmalloc), + KUNIT_CASE(test_init_kmalloc), + KUNIT_CASE(test_init_kzalloc), + KUNIT_CASE(test_uninit_stack_var), + KUNIT_CASE(test_init_stack_var), + KUNIT_CASE(test_params), + KUNIT_CASE(test_uninit_multiple_params), + KUNIT_CASE(test_uninit_kmsan_check_memory), + KUNIT_CASE(test_init_kmsan_vmap_vunmap), + KUNIT_CASE(test_init_vmalloc), + KUNIT_CASE(test_uaf), + KUNIT_CASE(test_percpu_propagate), + KUNIT_CASE(test_printk), + KUNIT_CASE(test_memcpy_aligned_to_aligned), + KUNIT_CASE(test_memcpy_aligned_to_unaligned), + KUNIT_CASE(test_memcpy_aligned_to_unaligned2), + KUNIT_CASE(test_long_origin_chain), + {}, +}; + +/* ===== End test cases ===== */ + +static int test_init(struct kunit *test) +{ + unsigned long flags; + + spin_lock_irqsave(&observed.lock, flags); + observed.header[0] = '\0'; + observed.ignore = false; + observed.available = false; + spin_unlock_irqrestore(&observed.lock, flags); + + return 0; +} + +static void test_exit(struct kunit *test) +{ +} + +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +static int kmsan_suite_init(struct kunit_suite *suite) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. 
+ */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return 0; +} + +static void kmsan_suite_exit(struct kunit_suite *suite) +{ + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +static struct kunit_suite kmsan_test_suite = { + .name = "kmsan", + .test_cases = kmsan_test_cases, + .init = test_init, + .exit = test_exit, + .suite_init = kmsan_suite_init, + .suite_exit = kmsan_suite_exit, +}; +kunit_test_suites(&kmsan_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Alexander Potapenko "); diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c new file mode 100644 index 000000000..02736ec75 --- /dev/null +++ b/mm/kmsan/report.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN error reporting routines. + * + * Copyright (C) 2019-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include + +#include "kmsan.h" + +static DEFINE_RAW_SPINLOCK(kmsan_report_lock); +#define DESCR_SIZE 128 +/* Protected by kmsan_report_lock */ +static char report_local_descr[DESCR_SIZE]; +int panic_on_kmsan __read_mostly; + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "kmsan." +module_param_named(panic, panic_on_kmsan, int, 0); + +/* + * Skip internal KMSAN frames. + */ +static int get_stack_skipnr(const unsigned long stack_entries[], + int num_entries) +{ + int len, skip; + char buf[64]; + + for (skip = 0; skip < num_entries; ++skip) { + len = scnprintf(buf, sizeof(buf), "%ps", + (void *)stack_entries[skip]); + + /* Never show __msan_* or kmsan_* functions. */ + if ((strnstr(buf, "__msan_", len) == buf) || + (strnstr(buf, "kmsan_", len) == buf)) + continue; + + /* + * No match for runtime functions -- @skip entries to skip to + * get to first frame of interest. 
+ */ + break; + } + + return skip; +} + +/* + * Currently the descriptions of locals generated by Clang look as follows: + * ----local_name@function_name + * We want to print only the name of the local, as other information in that + * description can be confusing. + * The meaningful part of the description is copied to a global buffer to avoid + * allocating memory. + */ +static char *pretty_descr(char *descr) +{ + int pos = 0, len = strlen(descr); + + for (int i = 0; i < len; i++) { + if (descr[i] == '@') + break; + if (descr[i] == '-') + continue; + report_local_descr[pos] = descr[i]; + if (pos + 1 == DESCR_SIZE) + break; + pos++; + } + report_local_descr[pos] = 0; + return report_local_descr; +} + +void kmsan_print_origin(depot_stack_handle_t origin) +{ + unsigned long *entries = NULL, *chained_entries = NULL; + unsigned int nr_entries, chained_nr_entries, skipnr; + void *pc1 = NULL, *pc2 = NULL; + depot_stack_handle_t head; + unsigned long magic; + char *descr = NULL; + unsigned int depth; + + if (!origin) + return; + + while (true) { + nr_entries = stack_depot_fetch(origin, &entries); + depth = kmsan_depth_from_eb(stack_depot_get_extra_bits(origin)); + magic = nr_entries ? entries[0] : 0; + if ((nr_entries == 4) && (magic == KMSAN_ALLOCA_MAGIC_ORIGIN)) { + descr = (char *)entries[1]; + pc1 = (void *)entries[2]; + pc2 = (void *)entries[3]; + pr_err("Local variable %s created at:\n", + pretty_descr(descr)); + if (pc1) + pr_err(" %pSb\n", pc1); + if (pc2) + pr_err(" %pSb\n", pc2); + break; + } + if ((nr_entries == 3) && (magic == KMSAN_CHAIN_MAGIC_ORIGIN)) { + /* + * Origin chains deeper than KMSAN_MAX_ORIGIN_DEPTH are + * not stored, so the output may be incomplete. 
+ */ + if (depth == KMSAN_MAX_ORIGIN_DEPTH) + pr_err("\n\n"); + head = entries[1]; + origin = entries[2]; + pr_err("Uninit was stored to memory at:\n"); + chained_nr_entries = + stack_depot_fetch(head, &chained_entries); + kmsan_internal_unpoison_memory( + chained_entries, + chained_nr_entries * sizeof(*chained_entries), + /*checked*/ false); + skipnr = get_stack_skipnr(chained_entries, + chained_nr_entries); + stack_trace_print(chained_entries + skipnr, + chained_nr_entries - skipnr, 0); + pr_err("\n"); + continue; + } + pr_err("Uninit was created at:\n"); + if (nr_entries) { + skipnr = get_stack_skipnr(entries, nr_entries); + stack_trace_print(entries + skipnr, nr_entries - skipnr, + 0); + } else { + pr_err("(stack is not available)\n"); + } + break; + } +} + +void kmsan_report(depot_stack_handle_t origin, void *address, int size, + int off_first, int off_last, const void *user_addr, + enum kmsan_bug_reason reason) +{ + unsigned long stack_entries[KMSAN_STACK_DEPTH]; + int num_stack_entries, skipnr; + char *bug_type = NULL; + unsigned long ua_flags; + bool is_uaf; + + if (!kmsan_enabled) + return; + if (!current->kmsan_ctx.allow_reporting) + return; + if (!origin) + return; + + current->kmsan_ctx.allow_reporting = false; + ua_flags = user_access_save(); + raw_spin_lock(&kmsan_report_lock); + pr_err("=====================================================\n"); + is_uaf = kmsan_uaf_from_eb(stack_depot_get_extra_bits(origin)); + switch (reason) { + case REASON_ANY: + bug_type = is_uaf ? "use-after-free" : "uninit-value"; + break; + case REASON_COPY_TO_USER: + bug_type = is_uaf ? "kernel-infoleak-after-free" : + "kernel-infoleak"; + break; + case REASON_SUBMIT_URB: + bug_type = is_uaf ? 
"kernel-usb-infoleak-after-free" : + "kernel-usb-infoleak"; + break; + } + + num_stack_entries = + stack_trace_save(stack_entries, KMSAN_STACK_DEPTH, 1); + skipnr = get_stack_skipnr(stack_entries, num_stack_entries); + + pr_err("BUG: KMSAN: %s in %pSb\n", bug_type, + (void *)stack_entries[skipnr]); + stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, + 0); + pr_err("\n"); + + kmsan_print_origin(origin); + + if (size) { + pr_err("\n"); + if (off_first == off_last) + pr_err("Byte %d of %d is uninitialized\n", off_first, + size); + else + pr_err("Bytes %d-%d of %d are uninitialized\n", + off_first, off_last, size); + } + if (address) + pr_err("Memory access of size %d starts at %px\n", size, + address); + if (user_addr && reason == REASON_COPY_TO_USER) + pr_err("Data copied to user address %px\n", user_addr); + pr_err("\n"); + dump_stack_print_info(KERN_ERR); + pr_err("=====================================================\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + raw_spin_unlock(&kmsan_report_lock); + if (panic_on_kmsan) + panic("kmsan.panic set ...\n"); + user_access_restore(ua_flags); + current->kmsan_ctx.allow_reporting = true; +} diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c new file mode 100644 index 000000000..b8bb95eea --- /dev/null +++ b/mm/kmsan/shadow.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN shadow implementation. 
+ * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "kmsan.h" + +#define shadow_page_for(page) ((page)->kmsan_shadow) + +#define origin_page_for(page) ((page)->kmsan_origin) + +static void *shadow_ptr_for(struct page *page) +{ + return page_address(shadow_page_for(page)); +} + +static void *origin_ptr_for(struct page *page) +{ + return page_address(origin_page_for(page)); +} + +static bool page_has_metadata(struct page *page) +{ + return shadow_page_for(page) && origin_page_for(page); +} + +static void set_no_shadow_origin_page(struct page *page) +{ + shadow_page_for(page) = NULL; + origin_page_for(page) = NULL; +} + +/* + * Dummy load and store pages to be used when the real metadata is unavailable. + * There are separate pages for loads and stores, so that every load returns a + * zero, and every store doesn't affect other loads. + */ +static char dummy_load_page[PAGE_SIZE] __aligned(PAGE_SIZE); +static char dummy_store_page[PAGE_SIZE] __aligned(PAGE_SIZE); + +static unsigned long vmalloc_meta(void *addr, bool is_origin) +{ + unsigned long addr64 = (unsigned long)addr, off; + + KMSAN_WARN_ON(is_origin && !IS_ALIGNED(addr64, KMSAN_ORIGIN_SIZE)); + if (kmsan_internal_is_vmalloc_addr(addr)) { + off = addr64 - VMALLOC_START; + return off + (is_origin ? KMSAN_VMALLOC_ORIGIN_START : + KMSAN_VMALLOC_SHADOW_START); + } + if (kmsan_internal_is_module_addr(addr)) { + off = addr64 - MODULES_VADDR; + return off + (is_origin ? 
KMSAN_MODULES_ORIGIN_START : + KMSAN_MODULES_SHADOW_START); + } + return 0; +} + +static struct page *virt_to_page_or_null(void *vaddr) +{ + if (kmsan_virt_addr_valid(vaddr)) + return virt_to_page(vaddr); + else + return NULL; +} + +struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *address, u64 size, + bool store) +{ + struct shadow_origin_ptr ret; + void *shadow; + + /* + * Even if we redirect this memory access to the dummy page, it will + * go out of bounds. + */ + KMSAN_WARN_ON(size > PAGE_SIZE); + + if (!kmsan_enabled) + goto return_dummy; + + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(address, size)); + shadow = kmsan_get_metadata(address, KMSAN_META_SHADOW); + if (!shadow) + goto return_dummy; + + ret.shadow = shadow; + ret.origin = kmsan_get_metadata(address, KMSAN_META_ORIGIN); + return ret; + +return_dummy: + if (store) { + /* Ignore this store. */ + ret.shadow = dummy_store_page; + ret.origin = dummy_store_page; + } else { + /* This load will return zero. */ + ret.shadow = dummy_load_page; + ret.origin = dummy_load_page; + } + return ret; +} + +/* + * Obtain the shadow or origin pointer for the given address, or NULL if there's + * none. The caller must check the return value for being non-NULL if needed. + * The return value of this function should not depend on whether we're in the + * runtime or not. 
+ */ +void *kmsan_get_metadata(void *address, bool is_origin) +{ + u64 addr = (u64)address, pad, off; + struct page *page; + void *ret; + + if (is_origin && !IS_ALIGNED(addr, KMSAN_ORIGIN_SIZE)) { + pad = addr % KMSAN_ORIGIN_SIZE; + addr -= pad; + } + address = (void *)addr; + if (kmsan_internal_is_vmalloc_addr(address) || + kmsan_internal_is_module_addr(address)) + return (void *)vmalloc_meta(address, is_origin); + + ret = arch_kmsan_get_meta_or_null(address, is_origin); + if (ret) + return ret; + + page = virt_to_page_or_null(address); + if (!page) + return NULL; + if (!page_has_metadata(page)) + return NULL; + off = addr % PAGE_SIZE; + + return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off; +} + +void kmsan_copy_page_meta(struct page *dst, struct page *src) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + if (!dst || !page_has_metadata(dst)) + return; + if (!src || !page_has_metadata(src)) { + kmsan_internal_unpoison_memory(page_address(dst), PAGE_SIZE, + /*checked*/ false); + return; + } + + kmsan_enter_runtime(); + __memcpy(shadow_ptr_for(dst), shadow_ptr_for(src), PAGE_SIZE); + __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(kmsan_copy_page_meta); + +void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) +{ + bool initialized = (flags & __GFP_ZERO) || !kmsan_enabled; + struct page *shadow, *origin; + depot_stack_handle_t handle; + int pages = 1 << order; + + if (!page) + return; + + shadow = shadow_page_for(page); + origin = origin_page_for(page); + + if (initialized) { + __memset(page_address(shadow), 0, PAGE_SIZE * pages); + __memset(page_address(origin), 0, PAGE_SIZE * pages); + return; + } + + /* Zero pages allocated by the runtime should also be initialized. 
*/ + if (kmsan_in_runtime()) + return; + + __memset(page_address(shadow), -1, PAGE_SIZE * pages); + kmsan_enter_runtime(); + handle = kmsan_save_stack_with_flags(flags, /*extra_bits*/ 0); + kmsan_leave_runtime(); + /* + * Addresses are page-aligned, pages are contiguous, so it's ok + * to just fill the origin pages with @handle. + */ + for (int i = 0; i < PAGE_SIZE * pages / sizeof(handle); i++) + ((depot_stack_handle_t *)page_address(origin))[i] = handle; +} + +void kmsan_free_page(struct page *page, unsigned int order) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + kmsan_internal_poison_memory(page_address(page), + PAGE_SIZE << compound_order(page), + GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + +int kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) +{ + unsigned long shadow_start, origin_start, shadow_end, origin_end; + struct page **s_pages, **o_pages; + int nr, mapped, err = 0; + + if (!kmsan_enabled) + return 0; + + shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW); + shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW); + if (!shadow_start) + return 0; + + nr = (end - start) / PAGE_SIZE; + s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); + o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); + if (!s_pages || !o_pages) { + err = -ENOMEM; + goto ret; + } + for (int i = 0; i < nr; i++) { + s_pages[i] = shadow_page_for(pages[i]); + o_pages[i] = origin_page_for(pages[i]); + } + prot = __pgprot(pgprot_val(prot) | _PAGE_NX); + prot = PAGE_KERNEL; + + origin_start = vmalloc_meta((void *)start, KMSAN_META_ORIGIN); + origin_end = vmalloc_meta((void *)end, KMSAN_META_ORIGIN); + kmsan_enter_runtime(); + mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot, + s_pages, page_shift); + if (mapped) { + err = mapped; + goto ret; + } + mapped = 
__vmap_pages_range_noflush(origin_start, origin_end, prot, + o_pages, page_shift); + if (mapped) { + err = mapped; + goto ret; + } + kmsan_leave_runtime(); + flush_tlb_kernel_range(shadow_start, shadow_end); + flush_tlb_kernel_range(origin_start, origin_end); + flush_cache_vmap(shadow_start, shadow_end); + flush_cache_vmap(origin_start, origin_end); + +ret: + kfree(s_pages); + kfree(o_pages); + return err; +} + +/* Allocate metadata for pages allocated at boot time. */ +void __init kmsan_init_alloc_meta_for_range(void *start, void *end) +{ + struct page *shadow_p, *origin_p; + void *shadow, *origin; + struct page *page; + u64 size; + + start = (void *)ALIGN_DOWN((u64)start, PAGE_SIZE); + size = ALIGN((u64)end - (u64)start, PAGE_SIZE); + shadow = memblock_alloc(size, PAGE_SIZE); + origin = memblock_alloc(size, PAGE_SIZE); + for (u64 addr = 0; addr < size; addr += PAGE_SIZE) { + page = virt_to_page_or_null((char *)start + addr); + shadow_p = virt_to_page_or_null((char *)shadow + addr); + set_no_shadow_origin_page(shadow_p); + shadow_page_for(page) = shadow_p; + origin_p = virt_to_page_or_null((char *)origin + addr); + set_no_shadow_origin_page(origin_p); + origin_page_for(page) = origin_p; + } +} + +void kmsan_setup_meta(struct page *page, struct page *shadow, + struct page *origin, int order) +{ + for (int i = 0; i < (1 << order); i++) { + set_no_shadow_origin_page(&shadow[i]); + set_no_shadow_origin_page(&origin[i]); + shadow_page_for(&page[i]) = &shadow[i]; + origin_page_for(&page[i]) = &origin[i]; + } +} diff --git a/mm/ksm.c b/mm/ksm.c new file mode 100644 index 000000000..cb272b6fd --- /dev/null +++ b/mm/ksm.c @@ -0,0 +1,3230 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Memory merging support. + * + * This code enables dynamic sharing of identical pages found in different + * memory areas, even if they are not shared by fork() + * + * Copyright (C) 2008-2009 Red Hat, Inc. 
+ * Authors: + * Izik Eidus + * Andrea Arcangeli + * Chris Wright + * Hugh Dickins + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "internal.h" +#include "mm_slot.h" + +#ifdef CONFIG_NUMA +#define NUMA(x) (x) +#define DO_NUMA(x) do { (x); } while (0) +#else +#define NUMA(x) (0) +#define DO_NUMA(x) do { } while (0) +#endif + +/** + * DOC: Overview + * + * A few notes about the KSM scanning process, + * to make it easier to understand the data structures below: + * + * In order to reduce excessive scanning, KSM sorts the memory pages by their + * contents into a data structure that holds pointers to the pages' locations. + * + * Since the contents of the pages may change at any moment, KSM cannot just + * insert the pages into a normal sorted tree and expect it to find anything. + * Therefore KSM uses two data structures - the stable and the unstable tree. + * + * The stable tree holds pointers to all the merged pages (ksm pages), sorted + * by their contents. Because each such page is write-protected, searching on + * this tree is fully assured to be working (except when pages are unmapped), + * and therefore this tree is called the stable tree. + * + * The stable tree node includes information required for reverse + * mapping from a KSM page to virtual addresses that map this page. 
+ * + * In order to avoid large latencies of the rmap walks on KSM pages, + * KSM maintains two types of nodes in the stable tree: + * + * * the regular nodes that keep the reverse mapping structures in a + * linked list + * * the "chains" that link nodes ("dups") that represent the same + * write protected memory content, but each "dup" corresponds to a + * different KSM page copy of that content + * + * Internally, the regular nodes, "dups" and "chains" are represented + * using the same struct ksm_stable_node structure. + * + * In addition to the stable tree, KSM uses a second data structure called the + * unstable tree: this tree holds pointers to pages which have been found to + * be "unchanged for a period of time". The unstable tree sorts these pages + * by their contents, but since they are not write-protected, KSM cannot rely + * upon the unstable tree to work correctly - the unstable tree is liable to + * be corrupted as its contents are modified, and so it is called unstable. + * + * KSM solves this problem by several techniques: + * + * 1) The unstable tree is flushed every time KSM completes scanning all + * memory areas, and then the tree is rebuilt again from the beginning. + * 2) KSM will only insert into the unstable tree, pages whose hash value + * has not changed since the previous scan of all memory areas. + * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the + * colors of the nodes and not on their contents, assuring that even when + * the tree gets "corrupted" it won't get out of balance, so scanning time + * remains the same (also, searching and inserting nodes in an rbtree uses + * the same algorithm, so we have no overhead when we flush and rebuild). + * 4) KSM never flushes the stable tree, which means that even if it were to + * take 10 attempts to find a page in the unstable tree, once it is found, + * it is secured in the stable tree. 
(When we scan a new page, we first + * compare it against the stable tree, and then against the unstable tree.) + * + * If the merge_across_nodes tunable is unset, then KSM maintains multiple + * stable trees and multiple unstable trees: one of each for each NUMA node. + */ + +/** + * struct ksm_mm_slot - ksm information per mm that is being scanned + * @slot: hash lookup from mm to mm_slot + * @rmap_list: head for this mm_slot's singly-linked list of rmap_items + */ +struct ksm_mm_slot { + struct mm_slot slot; + struct ksm_rmap_item *rmap_list; +}; + +/** + * struct ksm_scan - cursor for scanning + * @mm_slot: the current mm_slot we are scanning + * @address: the next address inside that to be scanned + * @rmap_list: link to the next rmap to be scanned in the rmap_list + * @seqnr: count of completed full scans (needed when removing unstable node) + * + * There is only the one ksm_scan instance of this cursor structure. + */ +struct ksm_scan { + struct ksm_mm_slot *mm_slot; + unsigned long address; + struct ksm_rmap_item **rmap_list; + unsigned long seqnr; +}; + +/** + * struct ksm_stable_node - node of the stable rbtree + * @node: rb node of this ksm page in the stable tree + * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list + * @hlist_dup: linked into the stable_node->hlist with a stable_node chain + * @list: linked into migrate_nodes, pending placement in the proper node tree + * @hlist: hlist head of rmap_items using this ksm page + * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) + * @chain_prune_time: time of the last full garbage collection + * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN + * @nid: NUMA node id of stable tree in which linked (may not match kpfn) + */ +struct ksm_stable_node { + union { + struct rb_node node; /* when node of stable tree */ + struct { /* when listed for migration */ + struct list_head *head; + struct { + struct hlist_node hlist_dup; + 
struct list_head list; + }; + }; + }; + struct hlist_head hlist; + union { + unsigned long kpfn; + unsigned long chain_prune_time; + }; + /* + * STABLE_NODE_CHAIN can be any negative number in + * rmap_hlist_len negative range, but better not -1 to be able + * to reliably detect underflows. + */ +#define STABLE_NODE_CHAIN -1024 + int rmap_hlist_len; +#ifdef CONFIG_NUMA + int nid; +#endif +}; + +/** + * struct ksm_rmap_item - reverse mapping item for virtual addresses + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree + * @nid: NUMA node id of unstable tree in which linked (may not match page) + * @mm: the memory structure this rmap_item is pointing into + * @address: the virtual address this rmap_item tracks (+ flags in low bits) + * @oldchecksum: previous checksum of the page at that virtual address + * @node: rb node of this rmap_item in the unstable tree + * @head: pointer to stable_node heading this list in the stable tree + * @hlist: link into hlist of rmap_items hanging off that stable_node + */ +struct ksm_rmap_item { + struct ksm_rmap_item *rmap_list; + union { + struct anon_vma *anon_vma; /* when stable */ +#ifdef CONFIG_NUMA + int nid; /* when node of unstable tree */ +#endif + }; + struct mm_struct *mm; + unsigned long address; /* + low bits used for flags below */ + unsigned int oldchecksum; /* when unstable */ + union { + struct rb_node node; /* when node of unstable tree */ + struct { /* when listed from stable tree */ + struct ksm_stable_node *head; + struct hlist_node hlist; + }; + }; +}; + +#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ +#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ +#define STABLE_FLAG 0x200 /* is listed from the stable tree */ + +/* The stable and unstable tree heads */ +static struct rb_root one_stable_tree[1] = { RB_ROOT }; +static struct rb_root one_unstable_tree[1] = { RB_ROOT }; +static struct 
rb_root *root_stable_tree = one_stable_tree; +static struct rb_root *root_unstable_tree = one_unstable_tree; + +/* Recently migrated nodes of stable tree, pending proper placement */ +static LIST_HEAD(migrate_nodes); +#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) + +#define MM_SLOTS_HASH_BITS 10 +static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); + +static struct ksm_mm_slot ksm_mm_head = { + .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node), +}; +static struct ksm_scan ksm_scan = { + .mm_slot = &ksm_mm_head, +}; + +static struct kmem_cache *rmap_item_cache; +static struct kmem_cache *stable_node_cache; +static struct kmem_cache *mm_slot_cache; + +/* The number of nodes in the stable tree */ +static unsigned long ksm_pages_shared; + +/* The number of page slots additionally sharing those nodes */ +static unsigned long ksm_pages_sharing; + +/* The number of nodes in the unstable tree */ +static unsigned long ksm_pages_unshared; + +/* The number of rmap_items in use: to calculate pages_volatile */ +static unsigned long ksm_rmap_items; + +/* The number of stable_node chains */ +static unsigned long ksm_stable_node_chains; + +/* The number of stable_node dups linked to the stable_node chains */ +static unsigned long ksm_stable_node_dups; + +/* Delay in pruning stale stable_node_dups in the stable_node_chains */ +static unsigned int ksm_stable_node_chains_prune_millisecs = 2000; + +/* Maximum number of page slots sharing a stable node */ +static int ksm_max_page_sharing = 256; + +/* Number of pages ksmd should scan in one batch */ +static unsigned int ksm_thread_pages_to_scan = 100; + +/* Milliseconds ksmd should sleep between batches */ +static unsigned int ksm_thread_sleep_millisecs = 20; + +/* Checksum of an empty (zeroed) page */ +static unsigned int zero_checksum __read_mostly; + +/* Whether to merge empty (zeroed) pages with actual zero pages */ +static bool ksm_use_zero_pages __read_mostly; + +#ifdef CONFIG_NUMA +/* Zeroed 
when merging across nodes is not allowed */ +static unsigned int ksm_merge_across_nodes = 1; +static int ksm_nr_node_ids = 1; +#else +#define ksm_merge_across_nodes 1U +#define ksm_nr_node_ids 1 +#endif + +#define KSM_RUN_STOP 0 +#define KSM_RUN_MERGE 1 +#define KSM_RUN_UNMERGE 2 +#define KSM_RUN_OFFLINE 4 +static unsigned long ksm_run = KSM_RUN_STOP; +static void wait_while_offlining(void); + +static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); +static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); +static DEFINE_MUTEX(ksm_thread_mutex); +static DEFINE_SPINLOCK(ksm_mmlist_lock); + +#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\ + sizeof(struct __struct), __alignof__(struct __struct),\ + (__flags), NULL) + +static int __init ksm_slab_init(void) +{ + rmap_item_cache = KSM_KMEM_CACHE(ksm_rmap_item, 0); + if (!rmap_item_cache) + goto out; + + stable_node_cache = KSM_KMEM_CACHE(ksm_stable_node, 0); + if (!stable_node_cache) + goto out_free1; + + mm_slot_cache = KSM_KMEM_CACHE(ksm_mm_slot, 0); + if (!mm_slot_cache) + goto out_free2; + + return 0; + +out_free2: + kmem_cache_destroy(stable_node_cache); +out_free1: + kmem_cache_destroy(rmap_item_cache); +out: + return -ENOMEM; +} + +static void __init ksm_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + kmem_cache_destroy(stable_node_cache); + kmem_cache_destroy(rmap_item_cache); + mm_slot_cache = NULL; +} + +static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain) +{ + return chain->rmap_hlist_len == STABLE_NODE_CHAIN; +} + +static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup) +{ + return dup->head == STABLE_NODE_DUP_HEAD; +} + +static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup, + struct ksm_stable_node *chain) +{ + VM_BUG_ON(is_stable_node_dup(dup)); + dup->head = STABLE_NODE_DUP_HEAD; + VM_BUG_ON(!is_stable_node_chain(chain)); + hlist_add_head(&dup->hlist_dup, &chain->hlist); + ksm_stable_node_dups++; +} + +static inline void 
__stable_node_dup_del(struct ksm_stable_node *dup) +{ + VM_BUG_ON(!is_stable_node_dup(dup)); + hlist_del(&dup->hlist_dup); + ksm_stable_node_dups--; +} + +static inline void stable_node_dup_del(struct ksm_stable_node *dup) +{ + VM_BUG_ON(is_stable_node_chain(dup)); + if (is_stable_node_dup(dup)) + __stable_node_dup_del(dup); + else + rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); +#ifdef CONFIG_DEBUG_VM + dup->head = NULL; +#endif +} + +static inline struct ksm_rmap_item *alloc_rmap_item(void) +{ + struct ksm_rmap_item *rmap_item; + + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | + __GFP_NORETRY | __GFP_NOWARN); + if (rmap_item) + ksm_rmap_items++; + return rmap_item; +} + +static inline void free_rmap_item(struct ksm_rmap_item *rmap_item) +{ + ksm_rmap_items--; + rmap_item->mm->ksm_rmap_items--; + rmap_item->mm = NULL; /* debug safety */ + kmem_cache_free(rmap_item_cache, rmap_item); +} + +static inline struct ksm_stable_node *alloc_stable_node(void) +{ + /* + * The allocation can take too long with GFP_KERNEL when memory is under + * pressure, which may lead to hung task warnings. Adding __GFP_HIGH + * grants access to memory reserves, helping to avoid this problem. + */ + return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); +} + +static inline void free_stable_node(struct ksm_stable_node *stable_node) +{ + VM_BUG_ON(stable_node->rmap_hlist_len && + !is_stable_node_chain(stable_node)); + kmem_cache_free(stable_node_cache, stable_node); +} + +/* + * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's + * page tables after it has passed through ksm_exit() - which, if necessary, + * takes mmap_lock briefly to serialize against them. ksm_exit() does not set + * a special flag: they can just back out as soon as mm_users goes to zero. + * ksm_test_exit() is used throughout to make this test for exit: in some + * places for correctness, in some places just to avoid unnecessary work. 
+ */ +static inline bool ksm_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +/* + * We use break_ksm to break COW on a ksm page: it's a stripped down + * + * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1) + * put_page(page); + * + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, + * in case the application has unmapped and remapped mm,addr meanwhile. + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP + * mmap of /dev/mem, where we would not want to touch it. + * + * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context + * of the process that owns 'vma'. We also do not want to enforce + * protection keys here anyway. + */ +static int break_ksm(struct vm_area_struct *vma, unsigned long addr) +{ + struct page *page; + vm_fault_t ret = 0; + + do { + cond_resched(); + page = follow_page(vma, addr, + FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); + if (IS_ERR_OR_NULL(page)) + break; + if (PageKsm(page)) + ret = handle_mm_fault(vma, addr, + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, + NULL); + else + ret = VM_FAULT_WRITE; + put_page(page); + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); + /* + * We must loop because handle_mm_fault() may back out if there's + * any difficulty e.g. if pte accessed bit gets updated concurrently. + * + * VM_FAULT_WRITE is what we have been hoping for: it indicates that + * COW has been broken, even if the vma does not permit VM_WRITE; + * but note that a concurrent fault might break PageKsm for us. + * + * VM_FAULT_SIGBUS could occur if we race with truncation of the + * backing file, which also invalidates anonymous pages: that's + * okay, that truncation will have unmapped the PageKsm for us. 
+ * + * VM_FAULT_OOM: at the time of writing (late July 2009), setting + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the + * current task has TIF_MEMDIE set, and will be OOM killed on return + * to user; and ksmd, having no mm, would never be chosen for that. + * + * But if the mm is in a limited mem_cgroup, then the fault may fail + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and + * even ksmd can fail in this way - though it's usually breaking ksm + * just to undo a merge it made a moment before, so unlikely to oom. + * + * That's a pity: we might therefore have more kernel pages allocated + * than we're counting as nodes in the stable tree; but ksm_do_scan + * will retry to break_cow on each pass, so should recover the page + * in due course. The important thing is to not let VM_MERGEABLE + * be cleared while any such pages might remain in the area. + */ + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; +} + +static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, + unsigned long addr) +{ + struct vm_area_struct *vma; + if (ksm_test_exit(mm)) + return NULL; + vma = vma_lookup(mm, addr); + if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + return NULL; + return vma; +} + +static void break_cow(struct ksm_rmap_item *rmap_item) +{ + struct mm_struct *mm = rmap_item->mm; + unsigned long addr = rmap_item->address; + struct vm_area_struct *vma; + + /* + * It is not an accident that whenever we want to break COW + * to undo, we also need to drop a reference to the anon_vma. 
+ */ + put_anon_vma(rmap_item->anon_vma); + + mmap_read_lock(mm); + vma = find_mergeable_vma(mm, addr); + if (vma) + break_ksm(vma, addr); + mmap_read_unlock(mm); +} + +static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) +{ + struct mm_struct *mm = rmap_item->mm; + unsigned long addr = rmap_item->address; + struct vm_area_struct *vma; + struct page *page; + + mmap_read_lock(mm); + vma = find_mergeable_vma(mm, addr); + if (!vma) + goto out; + + page = follow_page(vma, addr, FOLL_GET); + if (IS_ERR_OR_NULL(page)) + goto out; + if (is_zone_device_page(page)) + goto out_putpage; + if (PageAnon(page)) { + flush_anon_page(vma, page, addr); + flush_dcache_page(page); + } else { +out_putpage: + put_page(page); +out: + page = NULL; + } + mmap_read_unlock(mm); + return page; +} + +/* + * This helper is used for getting right index into array of tree roots. + * When merge_across_nodes knob is set to 1, there are only two rb-trees for + * stable and unstable pages from all nodes with roots in index 0. Otherwise, + * every node has its own stable and unstable tree. + */ +static inline int get_kpfn_nid(unsigned long kpfn) +{ + return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); +} + +static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup, + struct rb_root *root) +{ + struct ksm_stable_node *chain = alloc_stable_node(); + VM_BUG_ON(is_stable_node_chain(dup)); + if (likely(chain)) { + INIT_HLIST_HEAD(&chain->hlist); + chain->chain_prune_time = jiffies; + chain->rmap_hlist_len = STABLE_NODE_CHAIN; +#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) + chain->nid = NUMA_NO_NODE; /* debug */ +#endif + ksm_stable_node_chains++; + + /* + * Put the stable node chain in the first dimension of + * the stable tree and at the same time remove the old + * stable node. + */ + rb_replace_node(&dup->node, &chain->node, root); + + /* + * Move the old stable node to the second dimension + * queued in the hlist_dup. 
The invariant is that all + * dup stable_nodes in the chain->hlist point to pages + * that are write protected and have the exact same + * content. + */ + stable_node_chain_add_dup(dup, chain); + } + return chain; +} + +static inline void free_stable_node_chain(struct ksm_stable_node *chain, + struct rb_root *root) +{ + rb_erase(&chain->node, root); + free_stable_node(chain); + ksm_stable_node_chains--; +} + +static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) +{ + struct ksm_rmap_item *rmap_item; + + /* check it's not STABLE_NODE_CHAIN or negative */ + BUG_ON(stable_node->rmap_hlist_len < 0); + + hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { + if (rmap_item->hlist.next) + ksm_pages_sharing--; + else + ksm_pages_shared--; + + rmap_item->mm->ksm_merging_pages--; + + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); + stable_node->rmap_hlist_len--; + put_anon_vma(rmap_item->anon_vma); + rmap_item->address &= PAGE_MASK; + cond_resched(); + } + + /* + * We need the second aligned pointer of the migrate_nodes + * list_head to stay clear from the rb_parent_color union + * (aligned and different than any node) and also different + * from &migrate_nodes. This will verify that future list.h changes + * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it. + */ + BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); + BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); + + if (stable_node->head == &migrate_nodes) + list_del(&stable_node->list); + else + stable_node_dup_del(stable_node); + free_stable_node(stable_node); +} + +enum get_ksm_page_flags { + GET_KSM_PAGE_NOLOCK, + GET_KSM_PAGE_LOCK, + GET_KSM_PAGE_TRYLOCK +}; + +/* + * get_ksm_page: checks if the page indicated by the stable node + * is still its ksm page, despite having held no reference to it. 
+ * In which case we can trust the content of the page, and it + * returns the gotten page; but if the page has now been zapped, + * remove the stale node from the stable tree and return NULL. + * But beware, the stable node's page might be being migrated. + * + * You would expect the stable_node to hold a reference to the ksm page. + * But if it increments the page's count, swapping out has to wait for + * ksmd to come around again before it can free the page, which may take + * seconds or even minutes: much too unresponsive. So instead we use a + * "keyhole reference": access to the ksm page from the stable node peeps + * out through its keyhole to see if that page still holds the right key, + * pointing back to this stable node. This relies on freeing a PageAnon + * page to reset its page->mapping to NULL, and relies on no other use of + * a page to put something that might look like our key in page->mapping. + * Returns a referenced page, NULL if stale, or ERR_PTR(-EBUSY) if trylock fails. + */ +static struct page *get_ksm_page(struct ksm_stable_node *stable_node, + enum get_ksm_page_flags flags) +{ + struct page *page; + void *expected_mapping; + unsigned long kpfn; + + expected_mapping = (void *)((unsigned long)stable_node | + PAGE_MAPPING_KSM); +again: + kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ + page = pfn_to_page(kpfn); + if (READ_ONCE(page->mapping) != expected_mapping) + goto stale; + + /* + * We cannot do anything with the page while its refcount is 0. + * Usually 0 means free, or tail of a higher-order page: in which + * case this node is no longer referenced, and should be freed; + * however, it might mean that the page is under page_ref_freeze(). + * The __remove_mapping() case is easy, again the node is now stale; + * the same is in reuse_ksm_page() case; but if page is swapcache + * in folio_migrate_mapping(), it might still be our page, + * in which case it's essential to keep the node.
+ */ + while (!get_page_unless_zero(page)) { + /* + * Another check for page->mapping != expected_mapping would + * work here too. We have chosen the !PageSwapCache test to + * optimize the common case, when the page is or is about to + * be freed: PageSwapCache is cleared (under spin_lock_irq) + * in the ref_freeze section of __remove_mapping(); but Anon + * page->mapping reset to NULL later, in free_pages_prepare(). + */ + if (!PageSwapCache(page)) + goto stale; + cpu_relax(); + } + + if (READ_ONCE(page->mapping) != expected_mapping) { + put_page(page); + goto stale; + } + + if (flags == GET_KSM_PAGE_TRYLOCK) { + if (!trylock_page(page)) { + put_page(page); + return ERR_PTR(-EBUSY); + } + } else if (flags == GET_KSM_PAGE_LOCK) + lock_page(page); + + if (flags != GET_KSM_PAGE_NOLOCK) { + if (READ_ONCE(page->mapping) != expected_mapping) { + unlock_page(page); + put_page(page); + goto stale; + } + } + return page; + +stale: + /* + * We come here from above when page->mapping or !PageSwapCache + * suggests that the node is stale; but it might be under migration. + * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(), + * before checking whether node->kpfn has been changed. + */ + smp_rmb(); + if (READ_ONCE(stable_node->kpfn) != kpfn) + goto again; + remove_node_from_stable_tree(stable_node); + return NULL; +} + +/* + * Removing rmap_item from stable or unstable tree. + * This function will clean the information from the stable/unstable tree. 
+ */ +static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) +{ + if (rmap_item->address & STABLE_FLAG) { + struct ksm_stable_node *stable_node; + struct page *page; + + stable_node = rmap_item->head; + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); + if (!page) + goto out; + + hlist_del(&rmap_item->hlist); + unlock_page(page); + put_page(page); + + if (!hlist_empty(&stable_node->hlist)) + ksm_pages_sharing--; + else + ksm_pages_shared--; + + rmap_item->mm->ksm_merging_pages--; + + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); + stable_node->rmap_hlist_len--; + + put_anon_vma(rmap_item->anon_vma); + rmap_item->head = NULL; + rmap_item->address &= PAGE_MASK; + + } else if (rmap_item->address & UNSTABLE_FLAG) { + unsigned char age; + /* + * Usually ksmd can and must skip the rb_erase, because + * root_unstable_tree was already reset to RB_ROOT. + * But be careful when an mm is exiting: do the rb_erase + * if this rmap_item was inserted by this scan, rather + * than left over from before. + */ + age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); + BUG_ON(age > 1); + if (!age) + rb_erase(&rmap_item->node, + root_unstable_tree + NUMA(rmap_item->nid)); + ksm_pages_unshared--; + rmap_item->address &= PAGE_MASK; + } +out: + cond_resched(); /* we're called from many long loops */ +} + +static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) +{ + while (*rmap_list) { + struct ksm_rmap_item *rmap_item = *rmap_list; + *rmap_list = rmap_item->rmap_list; + remove_rmap_item_from_tree(rmap_item); + free_rmap_item(rmap_item); + } +} + +/* + * Though it's very tempting to unmerge rmap_items from stable tree rather + * than check every pte of a given vma, the locking doesn't quite work for + * that - an rmap_item is assigned to the stable tree after inserting ksm + * page and upping mmap_lock. 
Nor does it fit with the way we skip dup'ing + * rmap_items from parent to child at fork time (so as not to waste time + * if exit comes before the next scan reaches it). + * + * Similarly, although we'd like to remove rmap_items (so updating counts + * and freeing memory) when unmerging an area, it's easier to leave that + * to the next pass of ksmd - consider, for example, how ksmd might be + * in cmp_and_merge_page on one of the rmap_items we would be removing. + */ +static int unmerge_ksm_pages(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + unsigned long addr; + int err = 0; + + for (addr = start; addr < end && !err; addr += PAGE_SIZE) { + if (ksm_test_exit(vma->vm_mm)) + break; + if (signal_pending(current)) + err = -ERESTARTSYS; + else + err = break_ksm(vma, addr); + } + return err; +} + +static inline struct ksm_stable_node *folio_stable_node(struct folio *folio) +{ + return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; +} + +static inline struct ksm_stable_node *page_stable_node(struct page *page) +{ + return folio_stable_node(page_folio(page)); +} + +static inline void set_page_stable_node(struct page *page, + struct ksm_stable_node *stable_node) +{ + VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page); + page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); +} + +#ifdef CONFIG_SYSFS +/* + * Only called through the sysfs control interface: + */ +static int remove_stable_node(struct ksm_stable_node *stable_node) +{ + struct page *page; + int err; + + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); + if (!page) { + /* + * get_ksm_page did remove_node_from_stable_tree itself. + */ + return 0; + } + + /* + * Page could be still mapped if this races with __mmput() running in + * between ksm_exit() and exit_mmap(). Just refuse to let + * merge_across_nodes/max_page_sharing be switched. 
+ */ + err = -EBUSY; + if (!page_mapped(page)) { + /* + * The stable node did not yet appear stale to get_ksm_page(), + * since that allows for an unmapped ksm page to be recognized + * right up until it is freed; but the node is safe to remove. + * This page might be in a pagevec waiting to be freed, + * or it might be PageSwapCache (perhaps under writeback), + * or it might have been removed from swapcache a moment ago. + */ + set_page_stable_node(page, NULL); + remove_node_from_stable_tree(stable_node); + err = 0; + } + + unlock_page(page); + put_page(page); + return err; +} + +static int remove_stable_node_chain(struct ksm_stable_node *stable_node, + struct rb_root *root) +{ + struct ksm_stable_node *dup; + struct hlist_node *hlist_safe; + + if (!is_stable_node_chain(stable_node)) { + VM_BUG_ON(is_stable_node_dup(stable_node)); + if (remove_stable_node(stable_node)) + return true; + else + return false; + } + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + VM_BUG_ON(!is_stable_node_dup(dup)); + if (remove_stable_node(dup)) + return true; + } + BUG_ON(!hlist_empty(&stable_node->hlist)); + free_stable_node_chain(stable_node, root); + return false; +} + +static int remove_all_stable_nodes(void) +{ + struct ksm_stable_node *stable_node, *next; + int nid; + int err = 0; + + for (nid = 0; nid < ksm_nr_node_ids; nid++) { + while (root_stable_tree[nid].rb_node) { + stable_node = rb_entry(root_stable_tree[nid].rb_node, + struct ksm_stable_node, node); + if (remove_stable_node_chain(stable_node, + root_stable_tree + nid)) { + err = -EBUSY; + break; /* proceed to next nid */ + } + cond_resched(); + } + } + list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { + if (remove_stable_node(stable_node)) + err = -EBUSY; + cond_resched(); + } + return err; +} + +static int unmerge_and_remove_all_rmap_items(void) +{ + struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int err 
= 0; + + spin_lock(&ksm_mmlist_lock); + slot = list_entry(ksm_mm_head.slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); + spin_unlock(&ksm_mmlist_lock); + + for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; + mm_slot = ksm_scan.mm_slot) { + VMA_ITERATOR(vmi, mm_slot->slot.mm, 0); + + mm = mm_slot->slot.mm; + mmap_read_lock(mm); + + /* + * Exit right away if mm is exiting to avoid lockdep issue in + * the maple tree + */ + if (ksm_test_exit(mm)) + goto mm_exiting; + + for_each_vma(vmi, vma) { + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + continue; + err = unmerge_ksm_pages(vma, + vma->vm_start, vma->vm_end); + if (err) + goto error; + } + +mm_exiting: + remove_trailing_rmap_items(&mm_slot->rmap_list); + mmap_read_unlock(mm); + + spin_lock(&ksm_mmlist_lock); + slot = list_entry(mm_slot->slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); + if (ksm_test_exit(mm)) { + hash_del(&mm_slot->slot.hash); + list_del(&mm_slot->slot.mm_node); + spin_unlock(&ksm_mmlist_lock); + + mm_slot_free(mm_slot_cache, mm_slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + mmdrop(mm); + } else + spin_unlock(&ksm_mmlist_lock); + } + + /* Clean up stable nodes, but don't worry if some are still busy */ + remove_all_stable_nodes(); + ksm_scan.seqnr = 0; + return 0; + +error: + mmap_read_unlock(mm); + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = &ksm_mm_head; + spin_unlock(&ksm_mmlist_lock); + return err; +} +#endif /* CONFIG_SYSFS */ + +static u32 calc_checksum(struct page *page) +{ + u32 checksum; + void *addr = kmap_atomic(page); + checksum = xxhash(addr, PAGE_SIZE, 0); + kunmap_atomic(addr); + return checksum; +} + +static int write_protect_page(struct vm_area_struct *vma, struct page *page, + pte_t *orig_pte) +{ + struct mm_struct *mm = vma->vm_mm; + DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0); + int swapped; + int err = -EFAULT; + struct 
mmu_notifier_range range; + bool anon_exclusive; + + pvmw.address = page_address_in_vma(page, vma); + if (pvmw.address == -EFAULT) + goto out; + + BUG_ON(PageTransCompound(page)); + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + pvmw.address, + pvmw.address + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); + + if (!page_vma_mapped_walk(&pvmw)) + goto out_mn; + if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) + goto out_unlock; + + anon_exclusive = PageAnonExclusive(page); + if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || + (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || + anon_exclusive || mm_tlb_flush_pending(mm)) { + pte_t entry; + + swapped = PageSwapCache(page); + flush_cache_page(vma, pvmw.address, page_to_pfn(page)); + /* + * Ok this is tricky, when get_user_pages_fast() runs it doesn't + * take any lock, therefore the check that we are going to make + * with the pagecount against the mapcount is racy and + * O_DIRECT can happen right after the check. + * So we clear the pte and flush the tlb before the check: + * this assures us that no O_DIRECT can happen after the check + * or in the middle of the check. + * + * No need to notify as we are downgrading page table to read + * only not changing it to point to a new page. + * + * See Documentation/mm/mmu_notifier.rst + */ + entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); + /* + * Check that no O_DIRECT or similar I/O is in progress on the + * page + */ + if (page_mapcount(page) + 1 + swapped != page_count(page)) { + set_pte_at(mm, pvmw.address, pvmw.pte, entry); + goto out_unlock; + } + + /* See page_try_share_anon_rmap(): clear PTE first.
*/ + if (anon_exclusive && page_try_share_anon_rmap(page)) { + set_pte_at(mm, pvmw.address, pvmw.pte, entry); + goto out_unlock; + } + + if (pte_dirty(entry)) + set_page_dirty(page); + + if (pte_protnone(entry)) + entry = pte_mkclean(pte_clear_savedwrite(entry)); + else + entry = pte_mkclean(pte_wrprotect(entry)); + set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); + } + *orig_pte = *pvmw.pte; + err = 0; + +out_unlock: + page_vma_mapped_walk_done(&pvmw); +out_mn: + mmu_notifier_invalidate_range_end(&range); +out: + return err; +} + +/** + * replace_page - replace page in vma by new ksm page + * @vma: vma that holds the pte pointing to page + * @page: the page we are replacing by kpage + * @kpage: the ksm page we replace page by + * @orig_pte: the original value of the pte + * + * Returns 0 on success, -EFAULT on failure. + */ +static int replace_page(struct vm_area_struct *vma, struct page *page, + struct page *kpage, pte_t orig_pte) +{ + struct mm_struct *mm = vma->vm_mm; + struct folio *folio; + pmd_t *pmd; + pmd_t pmde; + pte_t *ptep; + pte_t newpte; + spinlock_t *ptl; + unsigned long addr; + int err = -EFAULT; + struct mmu_notifier_range range; + + addr = page_address_in_vma(page, vma); + if (addr == -EFAULT) + goto out; + + pmd = mm_find_pmd(mm, addr); + if (!pmd) + goto out; + /* + * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() + * without holding anon_vma lock for write. So when looking for a + * genuine pmde (in which to find pte), test present and !THP together. 
+ */ + pmde = *pmd; + barrier(); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) + goto out; + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + addr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); + + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte_same(*ptep, orig_pte)) { + pte_unmap_unlock(ptep, ptl); + goto out_mn; + } + VM_BUG_ON_PAGE(PageAnonExclusive(page), page); + VM_BUG_ON_PAGE(PageAnon(kpage) && PageAnonExclusive(kpage), kpage); + + /* + * No need to check ksm_use_zero_pages here: we can only have a + * zero_page here if ksm_use_zero_pages was enabled already. + */ + if (!is_zero_pfn(page_to_pfn(kpage))) { + get_page(kpage); + page_add_anon_rmap(kpage, vma, addr, RMAP_NONE); + newpte = mk_pte(kpage, vma->vm_page_prot); + } else { + newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage), + vma->vm_page_prot)); + /* + * We're replacing an anonymous page with a zero page, which is + * not anonymous. We need to do proper accounting otherwise we + * will get wrong values in /proc, and a BUG message in dmesg + * when tearing down the mm. + */ + dec_mm_counter(mm, MM_ANONPAGES); + } + + flush_cache_page(vma, addr, pte_pfn(*ptep)); + /* + * No need to notify as we are replacing a read only page with another + * read only page with the same content. 
+ * + * See Documentation/mm/mmu_notifier.rst + */ + ptep_clear_flush(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, newpte); + + folio = page_folio(page); + page_remove_rmap(page, vma, false); + if (!folio_mapped(folio)) + folio_free_swap(folio); + folio_put(folio); + + pte_unmap_unlock(ptep, ptl); + err = 0; +out_mn: + mmu_notifier_invalidate_range_end(&range); +out: + return err; +} + +/* + * try_to_merge_one_page - take two pages and merge them into one + * @vma: the vma that holds the pte pointing to page + * @page: the PageAnon page that we want to replace with kpage + * @kpage: the PageKsm page that we want to map instead of page, + * or NULL the first time when we want to use page as kpage. + * + * This function returns 0 if the pages were merged, -EFAULT otherwise. + */ +static int try_to_merge_one_page(struct vm_area_struct *vma, + struct page *page, struct page *kpage) +{ + pte_t orig_pte = __pte(0); + int err = -EFAULT; + + if (page == kpage) /* ksm page forked */ + return 0; + + if (!PageAnon(page)) + goto out; + + /* + * We need the page lock to read a stable PageSwapCache in + * write_protect_page(). We use trylock_page() instead of + * lock_page() because we don't want to wait here - we + * prefer to continue scanning and merging different pages, + * then come back to this page when it is unlocked. + */ + if (!trylock_page(page)) + goto out; + + if (PageTransCompound(page)) { + if (split_huge_page(page)) + goto out_unlock; + } + + /* + * If this anonymous page is mapped only here, its pte may need + * to be write-protected. If it's mapped elsewhere, all of its + * ptes are necessarily already write-protected. But in either + * case, we need to lock and check page_count is not raised. + */ + if (write_protect_page(vma, page, &orig_pte) == 0) { + if (!kpage) { + /* + * While we hold page lock, upgrade page from + * PageAnon+anon_vma to PageKsm+NULL stable_node: + * stable_tree_insert() will update stable_node. 
+ */ + set_page_stable_node(page, NULL); + mark_page_accessed(page); + /* + * Page reclaim just frees a clean page with no dirty + * ptes: make sure that the ksm page would be swapped. + */ + if (!PageDirty(page)) + SetPageDirty(page); + err = 0; + } else if (pages_identical(page, kpage)) + err = replace_page(vma, page, kpage, orig_pte); + } + +out_unlock: + unlock_page(page); +out: + return err; +} + +/* + * try_to_merge_with_ksm_page - like try_to_merge_two_pages, + * but no new kernel page is allocated: kpage is a ksm page, or NULL to make page one. + * + * This function returns 0 if the pages were merged, -EFAULT otherwise. + */ +static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, + struct page *page, struct page *kpage) +{ + struct mm_struct *mm = rmap_item->mm; + struct vm_area_struct *vma; + int err = -EFAULT; + + mmap_read_lock(mm); + vma = find_mergeable_vma(mm, rmap_item->address); + if (!vma) + goto out; + + err = try_to_merge_one_page(vma, page, kpage); + if (err) + goto out; + + /* Unstable nid is in union with stable anon_vma: remove first */ + remove_rmap_item_from_tree(rmap_item); + + /* Must get reference to anon_vma while still holding mmap_lock */ + rmap_item->anon_vma = vma->anon_vma; + get_anon_vma(vma->anon_vma); +out: + mmap_read_unlock(mm); + return err; +} + +/* + * try_to_merge_two_pages - take two identical pages and prepare them + * to be merged into one page. + * + * This function returns the kpage if we successfully merged two identical + * pages into one ksm page, NULL otherwise. + * + * Note that this function upgrades page to ksm page: if one of the pages + * is already a ksm page, try_to_merge_with_ksm_page should be used.
+ */ +static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, + struct page *page, + struct ksm_rmap_item *tree_rmap_item, + struct page *tree_page) +{ + int err; + + err = try_to_merge_with_ksm_page(rmap_item, page, NULL); + if (!err) { + err = try_to_merge_with_ksm_page(tree_rmap_item, + tree_page, page); + /* + * If that fails, we have a ksm page with only one pte + * pointing to it: so break it. + */ + if (err) + break_cow(rmap_item); + } + return err ? NULL : page; +} + +static __always_inline +bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset) +{ + VM_BUG_ON(stable_node->rmap_hlist_len < 0); + /* + * Check that at least one mapping still exists, otherwise + * there's not much point in merging and sharing with this + * stable_node, as the underlying tree_page of the other + * sharer is going to be freed soon. + */ + return stable_node->rmap_hlist_len && + stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; +} + +static __always_inline +bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) +{ + return __is_page_sharing_candidate(stable_node, 0); +} + +static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) +{ + struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; + struct hlist_node *hlist_safe; + struct page *_tree_page, *tree_page = NULL; + int nr = 0; + int found_rmap_hlist_len; + + if (!prune_stale_stable_nodes || + time_before(jiffies, stable_node->chain_prune_time + + msecs_to_jiffies( + ksm_stable_node_chains_prune_millisecs))) + prune_stale_stable_nodes = false; + else + stable_node->chain_prune_time = jiffies; + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + cond_resched(); + /* + * We must walk all stable_node_dup to prune the stale + * stable nodes during lookup.
+ * + * get_ksm_page can drop the nodes from the + * stable_node->hlist if they point to freed pages + * (that's why we do a _safe walk). The "dup" + * stable_node parameter itself will be freed from + * under us if it returns NULL. + */ + _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK); + if (!_tree_page) + continue; + nr += 1; + if (is_page_sharing_candidate(dup)) { + if (!found || + dup->rmap_hlist_len > found_rmap_hlist_len) { + if (found) + put_page(tree_page); + found = dup; + found_rmap_hlist_len = found->rmap_hlist_len; + tree_page = _tree_page; + + /* skip put_page for found dup */ + if (!prune_stale_stable_nodes) + break; + continue; + } + } + put_page(_tree_page); + } + + if (found) { + /* + * nr is counting all dups in the chain only if + * prune_stale_stable_nodes is true, otherwise we may + * break the loop at nr == 1 even if there are + * multiple entries. + */ + if (prune_stale_stable_nodes && nr == 1) { + /* + * If there's not just one entry it would + * corrupt memory, better BUG_ON. In KSM + * context with no lock held it's not even + * fatal. + */ + BUG_ON(stable_node->hlist.first->next); + + /* + * There's just one entry and it is below the + * deduplication limit so drop the chain. + */ + rb_replace_node(&stable_node->node, &found->node, + root); + free_stable_node(stable_node); + ksm_stable_node_chains--; + ksm_stable_node_dups--; + /* + * NOTE: the caller depends on the stable_node + * to be equal to stable_node_dup if the chain + * was collapsed. + */ + *_stable_node = found; + /* + * Just for robustness, as stable_node is + * otherwise left as a stale pointer, the + * compiler shall optimize it away at build + * time.
+ */ + stable_node = NULL; + } else if (stable_node->hlist.first != &found->hlist_dup && + __is_page_sharing_candidate(found, 1)) { + /* + * If the found stable_node dup can accept one + * more future merge (in addition to the one + * that is underway) and is not at the head of + * the chain, put it there so next search will + * be quicker in the !prune_stale_stable_nodes + * case. + * + * NOTE: it would be inaccurate to use nr > 1 + * instead of checking the hlist.first pointer + * directly, because in the + * prune_stale_stable_nodes case "nr" isn't + * the position of the found dup in the chain, + * but the total number of dups in the chain. + */ + hlist_del(&found->hlist_dup); + hlist_add_head(&found->hlist_dup, + &stable_node->hlist); + } + } + + *_stable_node_dup = found; + return tree_page; +} + +static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node, + struct rb_root *root) +{ + if (!is_stable_node_chain(stable_node)) + return stable_node; + if (hlist_empty(&stable_node->hlist)) { + free_stable_node_chain(stable_node, root); + return NULL; + } + return hlist_entry(stable_node->hlist.first, + typeof(*stable_node), hlist_dup); +} + +/* + * Like for get_ksm_page, this function can free the *_stable_node and + * *_stable_node_dup if the returned tree_page is NULL. + * + * It can also free and overwrite *_stable_node with the found + * stable_node_dup if the chain is collapsed (in which case + * *_stable_node will be equal to *_stable_node_dup like if the chain + * never existed). It's up to the caller to verify tree_page is not + * NULL before dereferencing *_stable_node or *_stable_node_dup. + * + * *_stable_node_dup is really a second output parameter of this + * function and will be overwritten in all cases, the caller doesn't + * need to initialize it. 
+ */ +static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) +{ + struct ksm_stable_node *stable_node = *_stable_node; + if (!is_stable_node_chain(stable_node)) { + if (is_page_sharing_candidate(stable_node)) { + *_stable_node_dup = stable_node; + return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); + } + /* + * _stable_node_dup set to NULL means the stable_node + * reached the ksm_max_page_sharing limit. + */ + *_stable_node_dup = NULL; + return NULL; + } + return stable_node_dup(_stable_node_dup, _stable_node, root, + prune_stale_stable_nodes); +} + +static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d, + struct ksm_stable_node **s_n, + struct rb_root *root) +{ + return __stable_node_chain(s_n_d, s_n, root, true); +} + +static __always_inline struct page *chain(struct ksm_stable_node **s_n_d, + struct ksm_stable_node *s_n, + struct rb_root *root) +{ + struct ksm_stable_node *old_stable_node = s_n; + struct page *tree_page; + + tree_page = __stable_node_chain(s_n_d, &s_n, root, false); + /* not pruning dups so s_n cannot have changed */ + VM_BUG_ON(s_n != old_stable_node); + return tree_page; +} + +/* + * stable_tree_search - search for page inside the stable tree + * + * This function checks if there is a page inside the stable tree + * with identical content to the page that we are scanning right now. + * + * This function returns the stable tree page of identical content (reference + * held) if found, ERR_PTR(-EBUSY) if it could not be trylocked, NULL otherwise.
+ */ +static struct page *stable_tree_search(struct page *page) +{ + int nid; + struct rb_root *root; + struct rb_node **new; + struct rb_node *parent; + struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; + struct ksm_stable_node *page_node; + + page_node = page_stable_node(page); + if (page_node && page_node->head != &migrate_nodes) { + /* ksm page forked */ + get_page(page); + return page; + } + + nid = get_kpfn_nid(page_to_pfn(page)); + root = root_stable_tree + nid; +again: + new = &root->rb_node; + parent = NULL; + + while (*new) { + struct page *tree_page; + int ret; + + cond_resched(); + stable_node = rb_entry(*new, struct ksm_stable_node, node); + stable_node_any = NULL; + tree_page = chain_prune(&stable_node_dup, &stable_node, root); + /* + * NOTE: stable_node may have been freed by + * chain_prune() if the returned stable_node_dup is + * not NULL. stable_node_dup may have been inserted in + * the rbtree instead as a regular stable_node (in + * order to collapse the stable_node chain if a single + * stable_node dup was found in it). In such case the + * stable_node is overwritten by the callee to point + * to the stable_node_dup that was collapsed in the + * stable rbtree and stable_node will be equal to + * stable_node_dup like if the chain never existed. + */ + if (!stable_node_dup) { + /* + * Either all stable_node dups were full in + * this stable_node chain, or this chain was + * empty and should be rb_erased. + */ + stable_node_any = stable_node_dup_any(stable_node, + root); + if (!stable_node_any) { + /* rb_erase just run */ + goto again; + } + /* + * Take any of the stable_node dups page of + * this stable_node chain to let the tree walk + * continue. All KSM pages belonging to the + * stable_node dups in a stable_node chain + * have the same content and they're + * write protected at all times. Any will work + * fine to continue the walk. 
+ */ + tree_page = get_ksm_page(stable_node_any, + GET_KSM_PAGE_NOLOCK); + } + VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); + if (!tree_page) { + /* + * If we walked over a stale stable_node, + * get_ksm_page() will call rb_erase() and it + * may rebalance the tree from under us. So + * restart the search from scratch. Returning + * NULL would be safe too, but we'd generate + * false negative insertions just because some + * stable_node was stale. + */ + goto again; + } + + ret = memcmp_pages(page, tree_page); + put_page(tree_page); + + parent = *new; + if (ret < 0) + new = &parent->rb_left; + else if (ret > 0) + new = &parent->rb_right; + else { + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + /* + * Test if the migrated page should be merged + * into a stable node dup. If the mapcount is + * 1 we can migrate it with another KSM page + * without adding it to the chain. + */ + if (page_mapcount(page) > 1) + goto chain_append; + } + + if (!stable_node_dup) { + /* + * If the stable_node is a chain and + * we got a payload match in memcmp + * but we cannot merge the scanned + * page in any of the existing + * stable_node dups because they're + * all full, we need to wait for the + * scanned page to find itself a match + * in the unstable tree to create a + * brand new KSM page to add later to + * the dups of this stable_node. + */ + return NULL; + } + + /* + * Lock and unlock the stable_node's page (which + * might already have been migrated) so that page + * migration is sure to notice its raised count. + * It would be more elegant to return stable_node + * than kpage, but that involves more changes. + */ + tree_page = get_ksm_page(stable_node_dup, + GET_KSM_PAGE_TRYLOCK); + + if (PTR_ERR(tree_page) == -EBUSY) + return ERR_PTR(-EBUSY); + + if (unlikely(!tree_page)) + /* + * The tree may have been rebalanced, + * so re-evaluate parent and new.
+ */ + goto again; + unlock_page(tree_page); + + if (get_kpfn_nid(stable_node_dup->kpfn) != + NUMA(stable_node_dup->nid)) { + put_page(tree_page); + goto replace; + } + return tree_page; + } + } + + if (!page_node) + return NULL; + + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + rb_link_node(&page_node->node, parent, new); + rb_insert_color(&page_node->node, root); +out: + if (is_page_sharing_candidate(page_node)) { + get_page(page); + return page; + } else + return NULL; + +replace: + /* + * If stable_node was a chain and chain_prune collapsed it, + * stable_node has been updated to be the new regular + * stable_node. A collapse of the chain is indistinguishable + * from the case there was no chain in the stable + * rbtree. Otherwise stable_node is the chain and + * stable_node_dup is the dup to replace. + */ + if (stable_node_dup == stable_node) { + VM_BUG_ON(is_stable_node_chain(stable_node_dup)); + VM_BUG_ON(is_stable_node_dup(stable_node_dup)); + /* there is no chain */ + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + rb_replace_node(&stable_node_dup->node, + &page_node->node, + root); + if (is_page_sharing_candidate(page_node)) + get_page(page); + else + page = NULL; + } else { + rb_erase(&stable_node_dup->node, root); + page = NULL; + } + } else { + VM_BUG_ON(!is_stable_node_chain(stable_node)); + __stable_node_dup_del(stable_node_dup); + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + stable_node_chain_add_dup(page_node, stable_node); + if (is_page_sharing_candidate(page_node)) + get_page(page); + else + page = NULL; + } else { + page = NULL; + } + } + stable_node_dup->head = &migrate_nodes; + list_add(&stable_node_dup->list, stable_node_dup->head); + return page; + +chain_append: + /* stable_node_dup could be null if it reached the limit */ + if (!stable_node_dup) + 
stable_node_dup = stable_node_any; + /* + * If stable_node was a chain and chain_prune collapsed it, + * stable_node has been updated to be the new regular + * stable_node. A collapse of the chain is indistinguishable + * from the case there was no chain in the stable + * rbtree. Otherwise stable_node is the chain and + * stable_node_dup is the dup to replace. + */ + if (stable_node_dup == stable_node) { + VM_BUG_ON(is_stable_node_dup(stable_node_dup)); + /* chain is missing so create it */ + stable_node = alloc_stable_node_chain(stable_node_dup, + root); + if (!stable_node) + return NULL; + } + /* + * Add this stable_node dup that was + * migrated to the stable_node chain + * of the current nid for this page + * content. + */ + VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + stable_node_chain_add_dup(page_node, stable_node); + goto out; +} + +/* + * stable_tree_insert - insert stable tree node pointing to new ksm page + * into the stable tree. + * + * This function returns the stable tree node just allocated on success, + * NULL otherwise. + */ +static struct ksm_stable_node *stable_tree_insert(struct page *kpage) +{ + int nid; + unsigned long kpfn; + struct rb_root *root; + struct rb_node **new; + struct rb_node *parent; + struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; + bool need_chain = false; + + kpfn = page_to_pfn(kpage); + nid = get_kpfn_nid(kpfn); + root = root_stable_tree + nid; +again: + parent = NULL; + new = &root->rb_node; + + while (*new) { + struct page *tree_page; + int ret; + + cond_resched(); + stable_node = rb_entry(*new, struct ksm_stable_node, node); + stable_node_any = NULL; + tree_page = chain(&stable_node_dup, stable_node, root); + if (!stable_node_dup) { + /* + * Either all stable_node dups were full in + * this stable_node chain, or this chain was + * empty and should be rb_erased. 
+ */ + stable_node_any = stable_node_dup_any(stable_node, + root); + if (!stable_node_any) { + /* rb_erase just run */ + goto again; + } + /* + * Take any of the stable_node dups page of + * this stable_node chain to let the tree walk + * continue. All KSM pages belonging to the + * stable_node dups in a stable_node chain + * have the same content and they're + * write protected at all times. Any will work + * fine to continue the walk. + */ + tree_page = get_ksm_page(stable_node_any, + GET_KSM_PAGE_NOLOCK); + } + VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); + if (!tree_page) { + /* + * If we walked over a stale stable_node, + * get_ksm_page() will call rb_erase() and it + * may rebalance the tree from under us. So + * restart the search from scratch. Returning + * NULL would be safe too, but we'd generate + * false negative insertions just because some + * stable_node was stale. + */ + goto again; + } + + ret = memcmp_pages(kpage, tree_page); + put_page(tree_page); + + parent = *new; + if (ret < 0) + new = &parent->rb_left; + else if (ret > 0) + new = &parent->rb_right; + else { + need_chain = true; + break; + } + } + + stable_node_dup = alloc_stable_node(); + if (!stable_node_dup) + return NULL; + + INIT_HLIST_HEAD(&stable_node_dup->hlist); + stable_node_dup->kpfn = kpfn; + set_page_stable_node(kpage, stable_node_dup); + stable_node_dup->rmap_hlist_len = 0; + DO_NUMA(stable_node_dup->nid = nid); + if (!need_chain) { + rb_link_node(&stable_node_dup->node, parent, new); + rb_insert_color(&stable_node_dup->node, root); + } else { + if (!is_stable_node_chain(stable_node)) { + struct ksm_stable_node *orig = stable_node; + /* chain is missing so create it */ + stable_node = alloc_stable_node_chain(orig, root); + if (!stable_node) { + free_stable_node(stable_node_dup); + return NULL; + } + } + stable_node_chain_add_dup(stable_node_dup, stable_node); + } + + return stable_node_dup; +} + +/* + * unstable_tree_search_insert - search for identical page, + * else insert 
rmap_item into the unstable tree.
+ *
+ * This function searches for a page in the unstable tree identical to the
+ * page currently being scanned; and if no identical page is found in the
+ * tree, we insert rmap_item as a new object into the unstable tree.
+ *
+ * This function returns pointer to rmap_item found to be identical
+ * to the currently scanned page, NULL otherwise.
+ *
+ * This function does both searching and inserting, because they share
+ * the same walking algorithm in an rbtree.
+ */
+static
+struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item,
+					      struct page *page,
+					      struct page **tree_pagep)
+{
+	struct rb_node **new;
+	struct rb_root *root;
+	struct rb_node *parent = NULL;
+	int nid;
+
+	nid = get_kpfn_nid(page_to_pfn(page));
+	root = root_unstable_tree + nid;
+	new = &root->rb_node;
+
+	while (*new) {
+		struct ksm_rmap_item *tree_rmap_item;
+		struct page *tree_page;
+		int ret;
+
+		cond_resched();
+		tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node);
+		tree_page = get_mergeable_page(tree_rmap_item);
+		if (!tree_page)
+			return NULL;
+
+		/*
+		 * Don't substitute a ksm page for a forked page: the tree
+		 * item resolved to the very page being scanned, so drop the
+		 * reference just taken and report no match.
+		 */
+		if (page == tree_page) {
+			put_page(tree_page);
+			return NULL;
+		}
+
+		ret = memcmp_pages(page, tree_page);
+
+		parent = *new;
+		if (ret < 0) {
+			put_page(tree_page);
+			new = &parent->rb_left;
+		} else if (ret > 0) {
+			put_page(tree_page);
+			new = &parent->rb_right;
+		} else if (!ksm_merge_across_nodes &&
+			   page_to_nid(tree_page) != nid) {
+			/*
+			 * If tree_page has been migrated to another NUMA node,
+			 * it will be flushed out and put in the right unstable
+			 * tree next time: only merge with it when across_nodes.
+			 */
+			put_page(tree_page);
+			return NULL;
+		} else {
+			*tree_pagep = tree_page;
+			return tree_rmap_item;
+		}
+	}
+
+	rmap_item->address |= UNSTABLE_FLAG;
+	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
+	DO_NUMA(rmap_item->nid = nid);
+	rb_link_node(&rmap_item->node, parent, new);
+	rb_insert_color(&rmap_item->node, root);
+
+	ksm_pages_unshared++;
+	return NULL;
+}
+
+/*
+ * stable_tree_append - add another rmap_item to the linked list of
+ * rmap_items hanging off a given node of the stable tree, all sharing
+ * the same ksm page.
+ *
+ * @rmap_item: the item to link; it is marked with STABLE_FLAG and its
+ *             head is pointed at @stable_node.
+ * @stable_node: the node whose hlist and rmap_hlist_len are extended.
+ * @max_page_sharing_bypass: when true, skip the warning issued if
+ *             rmap_hlist_len exceeds ksm_max_page_sharing.
+ *
+ * The first rmap_item linked to a node is accounted in ksm_pages_shared,
+ * each further one in ksm_pages_sharing; the owning mm's
+ * ksm_merging_pages is incremented in both cases.
+ */
+static void stable_tree_append(struct ksm_rmap_item *rmap_item,
+			       struct ksm_stable_node *stable_node,
+			       bool max_page_sharing_bypass)
+{
+	/*
+	 * rmap won't find this mapping if we don't insert the
+	 * rmap_item in the right stable_node
+	 * duplicate. page_migration could break later if rmap breaks,
+	 * so we can as well crash here. We really need to check for
+	 * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
+	 * for other negative values as an underflow if detected here
+	 * for the first time (and not when decreasing rmap_hlist_len)
+	 * would be sign of memory corruption in the stable_node.
+	 */
+	BUG_ON(stable_node->rmap_hlist_len < 0);
+
+	stable_node->rmap_hlist_len++;
+	if (!max_page_sharing_bypass)
+		/* possibly non fatal but unexpected overflow, only warn */
+		WARN_ON_ONCE(stable_node->rmap_hlist_len >
+			     ksm_max_page_sharing);
+
+	rmap_item->head = stable_node;
+	rmap_item->address |= STABLE_FLAG;
+	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
+
+	if (rmap_item->hlist.next)
+		ksm_pages_sharing++;
+	else
+		ksm_pages_shared++;
+
+	rmap_item->mm->ksm_merging_pages++;
+}
+
+/*
+ * cmp_and_merge_page - first see if page can be merged into the stable tree;
+ * if not, compare checksum to previous and if it's the same, see if page can
+ * be inserted into the unstable tree, or merged with a page already there and
+ * both transferred to the stable tree.
+ *
+ * @page: the page that we are searching identical page to.
+ * @rmap_item: the reverse mapping into the virtual address of this page
+ */
+static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
+{
+	struct mm_struct *mm = rmap_item->mm;
+	struct ksm_rmap_item *tree_rmap_item;
+	struct page *tree_page = NULL;
+	struct ksm_stable_node *stable_node;
+	struct page *kpage;
+	unsigned int checksum;
+	int err;
+	bool max_page_sharing_bypass = false;
+
+	stable_node = page_stable_node(page);
+	if (stable_node) {
+		if (stable_node->head != &migrate_nodes &&
+		    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
+		    NUMA(stable_node->nid)) {
+			stable_node_dup_del(stable_node);
+			stable_node->head = &migrate_nodes;
+			list_add(&stable_node->list, stable_node->head);
+		}
+		if (stable_node->head != &migrate_nodes &&
+		    rmap_item->head == stable_node)
+			return;
+		/*
+		 * If it's a KSM fork, allow it to go over the sharing limit
+		 * without warnings.
+		 */
+		if (!is_page_sharing_candidate(stable_node))
+			max_page_sharing_bypass = true;
+	}
+
+	/* Start by searching for the page in the stable tree */
+	kpage = stable_tree_search(page);
+	if (kpage == page && rmap_item->head == stable_node) {
+		put_page(kpage);
+		return;
+	}
+
+	remove_rmap_item_from_tree(rmap_item);
+
+	if (kpage) {
+		if (PTR_ERR(kpage) == -EBUSY)
+			return;
+
+		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
+		if (!err) {
+			/*
+			 * The page was successfully merged:
+			 * add its rmap_item to the stable tree.
+			 */
+			lock_page(kpage);
+			stable_tree_append(rmap_item, page_stable_node(kpage),
+					   max_page_sharing_bypass);
+			unlock_page(kpage);
+		}
+		put_page(kpage);
+		return;
+	}
+
+	/*
+	 * If the hash value of the page has changed from the last time
+	 * we calculated it, this page is changing frequently: therefore we
+	 * don't want to insert it in the unstable tree, and we don't want
+	 * to waste our time searching for something identical to it there.
+	 */
+	checksum = calc_checksum(page);
+	if (rmap_item->oldchecksum != checksum) {
+		rmap_item->oldchecksum = checksum;
+		return;
+	}
+
+	/*
+	 * Same checksum as an empty page. We attempt to merge it with the
+	 * appropriate zero page if the user enabled this via sysfs.
+	 */
+	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
+		struct vm_area_struct *vma;
+
+		mmap_read_lock(mm);
+		vma = find_mergeable_vma(mm, rmap_item->address);
+		if (vma) {
+			err = try_to_merge_one_page(vma, page,
+					ZERO_PAGE(rmap_item->address));
+		} else {
+			/*
+			 * If the vma is out of date, we do not need to
+			 * continue.
+			 */
+			err = 0;
+		}
+		mmap_read_unlock(mm);
+		/*
+		 * In case of failure, the page was not really empty, so we
+		 * need to continue. Otherwise we're done.
+		 */
+		if (!err)
+			return;
+	}
+	tree_rmap_item =
+		unstable_tree_search_insert(rmap_item, page, &tree_page);
+	if (tree_rmap_item) {
+		bool split;
+
+		kpage = try_to_merge_two_pages(rmap_item, page,
+						tree_rmap_item, tree_page);
+		/*
+		 * If both pages we tried to merge belong to the same compound
+		 * page, then we actually ended up increasing the reference
+		 * count of the same compound page twice, and split_huge_page
+		 * failed.
+		 * Here we set a flag if that happened, and we use it later to
+		 * try split_huge_page again. Since we call put_page right
+		 * afterwards, the reference count will be correct and
+		 * split_huge_page should succeed.
+		 */
+		split = PageTransCompound(page)
+			&& compound_head(page) == compound_head(tree_page);
+		put_page(tree_page);
+		if (kpage) {
+			/*
+			 * The pages were successfully merged: insert new
+			 * node in the stable tree and add both rmap_items.
+			 */
+			lock_page(kpage);
+			stable_node = stable_tree_insert(kpage);
+			if (stable_node) {
+				stable_tree_append(tree_rmap_item, stable_node,
+						   false);
+				stable_tree_append(rmap_item, stable_node,
+						   false);
+			}
+			unlock_page(kpage);
+
+			/*
+			 * If we fail to insert the page into the stable tree,
+			 * we will have 2 virtual addresses that are pointing
+			 * to a ksm page left outside the stable tree,
+			 * in which case we need to break_cow on both.
+			 */
+			if (!stable_node) {
+				break_cow(tree_rmap_item);
+				break_cow(rmap_item);
+			}
+		} else if (split) {
+			/*
+			 * We are here if we tried to merge two pages and
+			 * failed because they both belonged to the same
+			 * compound page. We will split the page now, but no
+			 * merging will take place.
+			 * We do not want to add the cost of a full lock; if
+			 * the page is locked, it is better to skip it and
+			 * perhaps try again later.
+			 */
+			if (!trylock_page(page))
+				return;
+			split_huge_page(page);
+			unlock_page(page);
+		}
+	}
+}
+
+static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
+					    struct ksm_rmap_item **rmap_list,
+					    unsigned long addr)
+{
+	struct ksm_rmap_item *rmap_item;
+
+	while (*rmap_list) {
+		rmap_item = *rmap_list;
+		if ((rmap_item->address & PAGE_MASK) == addr)
+			return rmap_item;
+		if (rmap_item->address > addr)
+			break;
+		*rmap_list = rmap_item->rmap_list;
+		remove_rmap_item_from_tree(rmap_item);
+		free_rmap_item(rmap_item);
+	}
+
+	rmap_item = alloc_rmap_item();
+	if (rmap_item) {
+		/*
+		 * It has already been zeroed, so only the fields that must
+		 * be non-zero are set before linking it in at *rmap_list.
+		 */
+		rmap_item->mm = mm_slot->slot.mm;
+		rmap_item->mm->ksm_rmap_items++;
+		rmap_item->address = addr;
+		rmap_item->rmap_list = *rmap_list;
+		*rmap_list = rmap_item;
+	}
+	return rmap_item;
+}
+
+static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
+{
+	struct mm_struct *mm;
+	struct ksm_mm_slot *mm_slot;
+	struct mm_slot *slot;
+	struct vm_area_struct *vma;
+	struct ksm_rmap_item *rmap_item;
+	struct vma_iterator vmi;
+	int nid;
+
+	if (list_empty(&ksm_mm_head.slot.mm_node))
+		return NULL;
+
+	mm_slot = ksm_scan.mm_slot;
+	if (mm_slot == &ksm_mm_head) {
+		/*
+		 * A number of pages can hang around indefinitely on per-cpu
+		 * pagevecs, raised page count preventing write_protect_page
+		 * from merging them.  Though it doesn't really matter much,
+		 * it is puzzling to see some stuck in pages_volatile until
+		 * other activity jostles them out, and they also prevented
+		 * LTP's KSM test from succeeding deterministically; so drain
+		 * them here (here rather than on entry to ksm_do_scan(),
+		 * so we don't IPI too often when pages_to_scan is set low).
+		 */
+		lru_add_drain_all();
+
+		/*
+		 * Whereas stale stable_nodes on the stable_tree itself
+		 * get pruned in the regular course of stable_tree_search(),
+		 * those moved out to the migrate_nodes list can accumulate:
+		 * so prune them once before each full scan.
+		 */
+		if (!ksm_merge_across_nodes) {
+			struct ksm_stable_node *stable_node, *next;
+			struct page *page;
+
+			list_for_each_entry_safe(stable_node, next,
+						 &migrate_nodes, list) {
+				page = get_ksm_page(stable_node,
+						    GET_KSM_PAGE_NOLOCK);
+				if (page)
+					put_page(page);
+				cond_resched();
+			}
+		}
+
+		for (nid = 0; nid < ksm_nr_node_ids; nid++)
+			root_unstable_tree[nid] = RB_ROOT;
+
+		spin_lock(&ksm_mmlist_lock);
+		slot = list_entry(mm_slot->slot.mm_node.next,
+				  struct mm_slot, mm_node);
+		mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
+		ksm_scan.mm_slot = mm_slot;
+		spin_unlock(&ksm_mmlist_lock);
+		/*
+		 * Although we tested list_empty() above, a racing __ksm_exit
+		 * of the last mm on the list may have removed it since then.
+		 */
+		if (mm_slot == &ksm_mm_head)
+			return NULL;
+next_mm:
+		ksm_scan.address = 0;
+		ksm_scan.rmap_list = &mm_slot->rmap_list;
+	}
+
+	slot = &mm_slot->slot;
+	mm = slot->mm;
+	vma_iter_init(&vmi, mm, ksm_scan.address);
+
+	mmap_read_lock(mm);
+	if (ksm_test_exit(mm))
+		goto no_vmas;
+
+	for_each_vma(vmi, vma) {
+		if (!(vma->vm_flags & VM_MERGEABLE))
+			continue;
+		if (ksm_scan.address < vma->vm_start)
+			ksm_scan.address = vma->vm_start;
+		if (!vma->anon_vma)
+			ksm_scan.address = vma->vm_end;
+
+		while (ksm_scan.address < vma->vm_end) {
+			if (ksm_test_exit(mm))
+				break;
+			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
+			if (IS_ERR_OR_NULL(*page)) {
+				ksm_scan.address += PAGE_SIZE;
+				cond_resched();
+				continue;
+			}
+			if (is_zone_device_page(*page))
+				goto next_page;
+			if (PageAnon(*page)) {
+				flush_anon_page(vma, *page, ksm_scan.address);
+				flush_dcache_page(*page);
+				rmap_item = get_next_rmap_item(mm_slot,
+					ksm_scan.rmap_list, ksm_scan.address);
+				if (rmap_item) {
+					ksm_scan.rmap_list =
+							&rmap_item->rmap_list;
+					ksm_scan.address += PAGE_SIZE;
+				} else
+					put_page(*page);
+				mmap_read_unlock(mm);
+				return rmap_item;
+			}
+next_page:
+			put_page(*page);
+			ksm_scan.address += PAGE_SIZE;
+			cond_resched();
+		}
+	}
+
+	if (ksm_test_exit(mm)) {
+no_vmas:
+		ksm_scan.address = 0;
+		ksm_scan.rmap_list = &mm_slot->rmap_list;
+	}
+	/*
+	 * Nuke all the rmap_items that are above this current rmap:
+	 * because there were no VM_MERGEABLE vmas with such addresses.
+	 */
+	remove_trailing_rmap_items(ksm_scan.rmap_list);
+
+	spin_lock(&ksm_mmlist_lock);
+	slot = list_entry(mm_slot->slot.mm_node.next,
+			  struct mm_slot, mm_node);
+	ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
+	if (ksm_scan.address == 0) {
+		/*
+		 * We've completed a full scan of all vmas, holding mmap_lock
+		 * throughout, and found no VM_MERGEABLE: so do the same as
+		 * __ksm_exit does to remove this mm from all our lists now.
+		 * This applies either when cleaning up after __ksm_exit
+		 * (but beware: we can reach here even before __ksm_exit),
+		 * or when all VM_MERGEABLE areas have been unmapped (and
+		 * mmap_lock then protects against race with MADV_MERGEABLE).
+		 */
+		hash_del(&mm_slot->slot.hash);
+		list_del(&mm_slot->slot.mm_node);
+		spin_unlock(&ksm_mmlist_lock);
+
+		mm_slot_free(mm_slot_cache, mm_slot);
+		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+		mmap_read_unlock(mm);
+		mmdrop(mm);
+	} else {
+		mmap_read_unlock(mm);
+		/*
+		 * mmap_read_unlock(mm) first because after
+		 * spin_unlock(&ksm_mmlist_lock) has run, the "mm" may
+		 * already have been freed under us by __ksm_exit()
+		 * because the "mm_slot" is still hashed and
+		 * ksm_scan.mm_slot doesn't point to it anymore.
+		 */
+		spin_unlock(&ksm_mmlist_lock);
+	}
+
+	/* Repeat until we've completed scanning the whole list */
+	mm_slot = ksm_scan.mm_slot;
+	if (mm_slot != &ksm_mm_head)
+		goto next_mm;
+
+	ksm_scan.seqnr++;
+	return NULL;
+}
+
+/**
+ * ksm_do_scan  - the ksm scanner main worker function.
+ * @scan_npages:  number of pages we want to scan before we return.
+ */
+static void ksm_do_scan(unsigned int scan_npages)
+{
+	struct ksm_rmap_item *rmap_item;
+	struct page *page;
+
+	while (scan_npages-- && likely(!freezing(current))) {
+		cond_resched();
+		rmap_item = scan_get_next_rmap_item(&page);
+		if (!rmap_item)
+			return;
+		cmp_and_merge_page(page, rmap_item);
+		put_page(page);
+	}
+}
+
+static int ksmd_should_run(void)
+{
+	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node);
+}
+
+static int ksm_scan_thread(void *nothing)
+{
+	unsigned int sleep_ms;
+
+	set_freezable();
+	set_user_nice(current, 5);
+
+	while (!kthread_should_stop()) {
+		mutex_lock(&ksm_thread_mutex);
+		wait_while_offlining();
+		if (ksmd_should_run())
+			ksm_do_scan(ksm_thread_pages_to_scan);
+		mutex_unlock(&ksm_thread_mutex);
+
+		try_to_freeze();
+
+		if (ksmd_should_run()) {
+			sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
+			wait_event_interruptible_timeout(ksm_iter_wait,
+				sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
+				msecs_to_jiffies(sleep_ms));
+		} else {
+			wait_event_freezable(ksm_thread_wait,
+				ksmd_should_run() || kthread_should_stop());
+		}
+	}
+	return 0;
+}
+
+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, int advice, unsigned long *vm_flags)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	int err;
+
+	switch (advice) {
+	case MADV_MERGEABLE:
+		/*
+		 * Be somewhat over-protective for now! The advice is
+		 * silently ignored (0 is returned) when the area is already
+		 * VM_MERGEABLE or carries any of the flags tested below.
+		 */
+		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
+				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
+				 VM_HUGETLB | VM_MIXEDMAP))
+			return 0;		/* just ignore the advice */
+
+		if (vma_is_dax(vma))
+			return 0;
+
+#ifdef VM_SAO
+		if (*vm_flags & VM_SAO)
+			return 0;
+#endif
+#ifdef VM_SPARC_ADI
+		if (*vm_flags & VM_SPARC_ADI)
+			return 0;
+#endif
+
+		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
+			err = __ksm_enter(mm);
+			if (err)
+				return err;
+		}
+
+		*vm_flags |= VM_MERGEABLE;
+		break;
+
+	case MADV_UNMERGEABLE:
+		if (!(*vm_flags & VM_MERGEABLE))
+			return 0;		/* just ignore the advice */
+
+		if (vma->anon_vma) {
+			err = unmerge_ksm_pages(vma, start, end);
+			if (err)
+				return err;
+		}
+
+		*vm_flags &= ~VM_MERGEABLE;
+		break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ksm_madvise);
+
+int __ksm_enter(struct mm_struct *mm)
+{
+	struct ksm_mm_slot *mm_slot;
+	struct mm_slot *slot;
+	int needs_wakeup;
+
+	mm_slot = mm_slot_alloc(mm_slot_cache);
+	if (!mm_slot)
+		return -ENOMEM;
+
+	slot = &mm_slot->slot;
+
+	/*
+	 * Check ksm_run too?  Would need tighter locking: note that this
+	 * emptiness test is made before ksm_mmlist_lock is taken.
+	 */
+	needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node);
+
+	spin_lock(&ksm_mmlist_lock);
+	mm_slot_insert(mm_slots_hash, mm, slot);
+	/*
+	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
+	 * insert just behind the scanning cursor, to let the area settle
+	 * down a little; when fork is followed by immediate exec, we don't
+	 * want ksmd to waste time setting up and tearing down an rmap_list.
+	 *
+	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
+	 * scanning cursor, otherwise KSM pages in newly forked mms will be
+	 * missed: then we might as well insert at the end of the list.
+	 */
+	if (ksm_run & KSM_RUN_UNMERGE)
+		list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node);
+	else
+		list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
+	spin_unlock(&ksm_mmlist_lock);
+
+	set_bit(MMF_VM_MERGEABLE, &mm->flags);
+	mmgrab(mm);
+
+	if (needs_wakeup)
+		wake_up_interruptible(&ksm_thread_wait);
+
+	return 0;
+}
+
+void __ksm_exit(struct mm_struct *mm)
+{
+	struct ksm_mm_slot *mm_slot;
+	struct mm_slot *slot;
+	int easy_to_free = 0;
+
+	/*
+	 * This process is exiting: if it's straightforward (as is the
+	 * case when ksmd was never running), free mm_slot immediately.
+	 * But if it's at the cursor or has rmap_items linked to it, use
+	 * mmap_lock to synchronize with any break_cows before pagetables
+	 * are freed, and leave the mm_slot on the list for ksmd to free.
+	 * Beware: ksm may already have noticed it exiting and freed the slot.
+	 */
+
+	spin_lock(&ksm_mmlist_lock);
+	slot = mm_slot_lookup(mm_slots_hash, mm);
+	mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
+	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
+		if (!mm_slot->rmap_list) {
+			hash_del(&slot->hash);
+			list_del(&slot->mm_node);
+			easy_to_free = 1;
+		} else {
+			list_move(&slot->mm_node,
+				  &ksm_scan.mm_slot->slot.mm_node);
+		}
+	}
+	spin_unlock(&ksm_mmlist_lock);
+
+	if (easy_to_free) {
+		mm_slot_free(mm_slot_cache, mm_slot);
+		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+		mmdrop(mm);
+	} else if (mm_slot) {
+		mmap_write_lock(mm);
+		mmap_write_unlock(mm);
+	}
+}
+
+struct page *ksm_might_need_to_copy(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	struct folio *folio = page_folio(page);
+	struct anon_vma *anon_vma = folio_anon_vma(folio);
+	struct page *new_page;
+
+	if (PageKsm(page)) {
+		if (page_stable_node(page) &&
+		    !(ksm_run & KSM_RUN_UNMERGE))
+			return page;	/* no need to copy it */
+	} else if (!anon_vma) {
+		return page;		/* no need to copy it */
+	} else if (page->index == linear_page_index(vma, address) &&
+			anon_vma->root == vma->anon_vma->root) {
+		return page;		/* still no need to copy it */
+	}
+	if (!PageUptodate(page))
+		return page;		/* let do_swap_page report the error */
+
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+	if (new_page &&
+	    mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) {
+		put_page(new_page);
+		new_page = NULL;
+	}
+	if (new_page) {
+		copy_user_highpage(new_page, page, address, vma);
+
+		SetPageDirty(new_page);
+		__SetPageUptodate(new_page);
+		__SetPageLocked(new_page);
+#ifdef CONFIG_SWAP
+		count_vm_event(KSM_SWPIN_COPY);
+#endif
+	}
+
+	return new_page;
+}
+
+void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
+{
+	struct ksm_stable_node *stable_node;
+	struct ksm_rmap_item *rmap_item;
+	int search_new_forks = 0;
+
+	VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);
+
+	/*
+	 * Rely on the page lock to protect against concurrent modifications
+	 * to that page's node of the stable tree.
+	 */
+	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+	stable_node = folio_stable_node(folio);
+	if (!stable_node)
+		return;
+again:
+	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
+		struct anon_vma *anon_vma = rmap_item->anon_vma;
+		struct anon_vma_chain *vmac;
+		struct vm_area_struct *vma;
+
+		cond_resched();
+		if (!anon_vma_trylock_read(anon_vma)) {
+			if (rwc->try_lock) {
+				rwc->contended = true;
+				return;
+			}
+			anon_vma_lock_read(anon_vma);
+		}
+		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+					       0, ULONG_MAX) {
+			unsigned long addr;
+
+			cond_resched();
+			vma = vmac->vma;
+
+			/* Ignore the stable/unstable/seqnr flags */
+			addr = rmap_item->address & PAGE_MASK;
+
+			if (addr < vma->vm_start || addr >= vma->vm_end)
+				continue;
+			/*
+			 * Initially we examine only the vma which covers this
+			 * rmap_item; but later, if there is still work to do,
+			 * we examine covering vmas in other mms: in case they
+			 * were forked from the original since ksmd passed.
+			 */
+			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
+				continue;
+
+			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+				continue;
+
+			if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
+				anon_vma_unlock_read(anon_vma);
+				return;
+			}
+			if (rwc->done && rwc->done(folio)) {
+				anon_vma_unlock_read(anon_vma);
+				return;
+			}
+		}
+		anon_vma_unlock_read(anon_vma);
+	}
+	if (!search_new_forks++)
+		goto again;
+}
+
+#ifdef CONFIG_MIGRATION
+void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
+{
+	struct ksm_stable_node *stable_node;
+
+	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+	VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
+	VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);
+
+	stable_node = folio_stable_node(folio);
+	if (stable_node) {
+		VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
+		stable_node->kpfn = folio_pfn(newfolio);
+		/*
+		 * newfolio->mapping was set in advance; now we need smp_wmb()
+		 * to make sure that the new stable_node->kpfn is visible
+		 * to get_ksm_page() before it can see that folio->mapping
+		 * has gone stale (or that folio_test_swapcache has been cleared).
+		 */
+		smp_wmb();
+		set_page_stable_node(&folio->page, NULL);
+	}
+}
+#endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static void wait_while_offlining(void)
+{
+	while (ksm_run & KSM_RUN_OFFLINE) {
+		mutex_unlock(&ksm_thread_mutex);
+		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
+			    TASK_UNINTERRUPTIBLE);
+		mutex_lock(&ksm_thread_mutex);
+	}
+}
+
+static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node,
+					 unsigned long start_pfn,
+					 unsigned long end_pfn)
+{
+	if (stable_node->kpfn >= start_pfn &&
+	    stable_node->kpfn < end_pfn) {
+		/*
+		 * Don't get_ksm_page, page has already gone:
+		 * which is why we keep kpfn instead of page*
+		 */
+		remove_node_from_stable_tree(stable_node);
+		return true;
+	}
+	return false;
+}
+
+static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node,
+					   unsigned long start_pfn,
+					   unsigned long end_pfn,
+					   struct rb_root *root)
+{
+	struct ksm_stable_node *dup;
+	struct hlist_node *hlist_safe;
+
+	if (!is_stable_node_chain(stable_node)) {
+		VM_BUG_ON(is_stable_node_dup(stable_node));
+		return stable_node_dup_remove_range(stable_node, start_pfn,
+						    end_pfn);
+	}
+
+	hlist_for_each_entry_safe(dup, hlist_safe,
+				  &stable_node->hlist, hlist_dup) {
+		VM_BUG_ON(!is_stable_node_dup(dup));
+		stable_node_dup_remove_range(dup, start_pfn, end_pfn);
+	}
+	if (hlist_empty(&stable_node->hlist)) {
+		free_stable_node_chain(stable_node, root);
+		return true; /* notify caller that tree was rebalanced */
+	} else
+		return false;
+}
+
+static void ksm_check_stable_tree(unsigned long start_pfn,
+				  unsigned long end_pfn)
+{
+	struct ksm_stable_node *stable_node, *next;
+	struct rb_node *node;
+	int nid;
+
+	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
+		node = rb_first(root_stable_tree + nid);
+		while (node) {
+			stable_node = rb_entry(node, struct ksm_stable_node, node);
+			if (stable_node_chain_remove_range(stable_node,
+							   start_pfn, end_pfn,
+							   root_stable_tree +
+							   nid))
+				node = rb_first(root_stable_tree + nid);
+			else
+				node = rb_next(node);
+			cond_resched();
+		}
+	}
+	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
+		if (stable_node->kpfn >= start_pfn &&
+		    stable_node->kpfn < end_pfn)
+			remove_node_from_stable_tree(stable_node);
+		cond_resched();
+	}
+}
+
+static int ksm_memory_callback(struct notifier_block *self,
+			       unsigned long action, void *arg)
+{
+	struct memory_notify *mn = arg;
+
+	switch (action) {
+	case MEM_GOING_OFFLINE:
+		/*
+		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
+		 * and remove_all_stable_nodes() while memory is going offline:
+		 * it is unsafe for them to touch the stable tree at this time.
+		 * But unmerge_ksm_pages(), rmap lookups and other entry points
+		 * which do not need the ksm_thread_mutex are all safe.
+		 */
+		mutex_lock(&ksm_thread_mutex);
+		ksm_run |= KSM_RUN_OFFLINE;
+		mutex_unlock(&ksm_thread_mutex);
+		break;
+
+	case MEM_OFFLINE:
+		/*
+		 * Most of the work is done by page migration; but there might
+		 * be a few stable_nodes left over, still pointing to struct
+		 * pages which have been offlined: prune those from the tree,
+		 * otherwise get_ksm_page() might later try to access a
+		 * non-existent struct page.
+		 */
+		ksm_check_stable_tree(mn->start_pfn,
+				      mn->start_pfn + mn->nr_pages);
+		fallthrough;
+	case MEM_CANCEL_OFFLINE:
+		mutex_lock(&ksm_thread_mutex);
+		ksm_run &= ~KSM_RUN_OFFLINE;
+		mutex_unlock(&ksm_thread_mutex);
+
+		smp_mb();	/* wake_up_bit advises this */
+		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
+		break;
+	}
+	return NOTIFY_OK;
+}
+#else
+static void wait_while_offlining(void)
+{
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+#ifdef CONFIG_SYSFS
+/*
+ * This all compiles without CONFIG_SYSFS, but is a waste of space.
+ */ + +#define KSM_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) +#define KSM_ATTR(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RW(_name) + +static ssize_t sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs); +} + +static ssize_t sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int msecs; + int err; + + err = kstrtouint(buf, 10, &msecs); + if (err) + return -EINVAL; + + ksm_thread_sleep_millisecs = msecs; + wake_up_interruptible(&ksm_iter_wait); + + return count; +} +KSM_ATTR(sleep_millisecs); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan); +} + +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int nr_pages; + int err; + + err = kstrtouint(buf, 10, &nr_pages); + if (err) + return -EINVAL; + + ksm_thread_pages_to_scan = nr_pages; + + return count; +} +KSM_ATTR(pages_to_scan); + +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%lu\n", ksm_run); +} + +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int flags; + int err; + + err = kstrtouint(buf, 10, &flags); + if (err) + return -EINVAL; + if (flags > KSM_RUN_UNMERGE) + return -EINVAL; + + /* + * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. + * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, + * breaking COW to free the pages_shared (but leaves mm_slots + * on the list for when ksmd may be set running again). 
+ */ + + mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); + if (ksm_run != flags) { + ksm_run = flags; + if (flags & KSM_RUN_UNMERGE) { + set_current_oom_origin(); + err = unmerge_and_remove_all_rmap_items(); + clear_current_oom_origin(); + if (err) { + ksm_run = KSM_RUN_STOP; + count = err; + } + } + } + mutex_unlock(&ksm_thread_mutex); + + if (flags & KSM_RUN_MERGE) + wake_up_interruptible(&ksm_thread_wait); + + return count; +} +KSM_ATTR(run); + +#ifdef CONFIG_NUMA +static ssize_t merge_across_nodes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes); +} + +static ssize_t merge_across_nodes_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long knob; + + err = kstrtoul(buf, 10, &knob); + if (err) + return err; + if (knob > 1) + return -EINVAL; + + mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); + if (ksm_merge_across_nodes != knob) { + if (ksm_pages_shared || remove_all_stable_nodes()) + err = -EBUSY; + else if (root_stable_tree == one_stable_tree) { + struct rb_root *roots; + /* + * This is the first time that we switch away from the + * default of merging across nodes: must now allocate + * a buffer to hold as many roots as may be needed. + * Allocate stable and unstable together: + * MAXSMP NODES_SHIFT 10 will use 16kB. + */ + roots = kcalloc(nr_node_ids + nr_node_ids, sizeof(*roots), + GFP_KERNEL); + /* Zeroed memory is assumed to be equivalent to RB_ROOT (NULL) */ + if (!roots) + err = -ENOMEM; + else { + root_stable_tree = roots; + root_unstable_tree = roots + nr_node_ids; + /* Stable tree is empty but not the unstable */ + root_unstable_tree[0] = one_unstable_tree[0]; + } + } + if (!err) { + ksm_merge_across_nodes = knob; + ksm_nr_node_ids = knob ? 1 : nr_node_ids; + } + } + mutex_unlock(&ksm_thread_mutex); + + return err ?
err : count; +} +KSM_ATTR(merge_across_nodes); +#endif + +static ssize_t use_zero_pages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%u\n", ksm_use_zero_pages); +} +static ssize_t use_zero_pages_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + bool value; + + err = kstrtobool(buf, &value); + if (err) + return -EINVAL; + + ksm_use_zero_pages = value; + + return count; +} +KSM_ATTR(use_zero_pages); + +static ssize_t max_page_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%u\n", ksm_max_page_sharing); +} + +static ssize_t max_page_sharing_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + int knob; + + err = kstrtoint(buf, 10, &knob); + if (err) + return err; + /* + * When a KSM page is created it is shared by 2 mappings. This + * being a signed comparison, it implicitly verifies it's not + * negative. + */ + if (knob < 2) + return -EINVAL; + + if (READ_ONCE(ksm_max_page_sharing) == knob) + return count; + + mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); + if (ksm_max_page_sharing != knob) { + if (ksm_pages_shared || remove_all_stable_nodes()) + err = -EBUSY; + else + ksm_max_page_sharing = knob; + } + mutex_unlock(&ksm_thread_mutex); + + return err ? 
err : count; +} +KSM_ATTR(max_page_sharing); + +static ssize_t pages_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", ksm_pages_shared); +} +KSM_ATTR_RO(pages_shared); + +static ssize_t pages_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", ksm_pages_sharing); +} +KSM_ATTR_RO(pages_sharing); + +static ssize_t pages_unshared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", ksm_pages_unshared); +} +KSM_ATTR_RO(pages_unshared); + +static ssize_t pages_volatile_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + long ksm_pages_volatile; + + ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared + - ksm_pages_sharing - ksm_pages_unshared; + /* + * It was not worth any locking to calculate that statistic, + * but it might therefore sometimes be negative: conceal that. + */ + if (ksm_pages_volatile < 0) + ksm_pages_volatile = 0; + return sysfs_emit(buf, "%ld\n", ksm_pages_volatile); +} +KSM_ATTR_RO(pages_volatile); + +static ssize_t stable_node_dups_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups); +} +KSM_ATTR_RO(stable_node_dups); + +static ssize_t stable_node_chains_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains); +} +KSM_ATTR_RO(stable_node_chains); + +static ssize_t +stable_node_chains_prune_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); +} + +static ssize_t +stable_node_chains_prune_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int msecs; + int err; + + err = kstrtouint(buf, 10, &msecs); + if (err) + return -EINVAL; + + 
ksm_stable_node_chains_prune_millisecs = msecs; + + return count; +} +KSM_ATTR(stable_node_chains_prune_millisecs); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr); +} +KSM_ATTR_RO(full_scans); + +static struct attribute *ksm_attrs[] = { + &sleep_millisecs_attr.attr, + &pages_to_scan_attr.attr, + &run_attr.attr, + &pages_shared_attr.attr, + &pages_sharing_attr.attr, + &pages_unshared_attr.attr, + &pages_volatile_attr.attr, + &full_scans_attr.attr, +#ifdef CONFIG_NUMA + &merge_across_nodes_attr.attr, +#endif + &max_page_sharing_attr.attr, + &stable_node_chains_attr.attr, + &stable_node_dups_attr.attr, + &stable_node_chains_prune_millisecs_attr.attr, + &use_zero_pages_attr.attr, + NULL, +}; + +static const struct attribute_group ksm_attr_group = { + .attrs = ksm_attrs, + .name = "ksm", +}; +#endif /* CONFIG_SYSFS */ + +static int __init ksm_init(void) +{ + struct task_struct *ksm_thread; + int err; + + /* The correct value depends on page size and endianness */ + zero_checksum = calc_checksum(ZERO_PAGE(0)); + /* Default to false for backwards compatibility */ + ksm_use_zero_pages = false; + + err = ksm_slab_init(); + if (err) + goto out; + + ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); + if (IS_ERR(ksm_thread)) { + pr_err("ksm: creating kthread failed\n"); + err = PTR_ERR(ksm_thread); + goto out_free; + } + +#ifdef CONFIG_SYSFS + err = sysfs_create_group(mm_kobj, &ksm_attr_group); + if (err) { + pr_err("ksm: register sysfs failed\n"); + kthread_stop(ksm_thread); + goto out_free; + } +#else + ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ + +#endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_MEMORY_HOTREMOVE + /* There is no significance to this priority 100 */ + hotplug_memory_notifier(ksm_memory_callback, 100); +#endif + return 0; + +out_free: + ksm_slab_free(); +out: + return err; +} +subsys_initcall(ksm_init); diff --git a/mm/list_lru.c 
b/mm/list_lru.c new file mode 100644 index 000000000..a05e5bef3 --- /dev/null +++ b/mm/list_lru.c @@ -0,0 +1,605 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. + * Authors: David Chinner and Glauber Costa + * + * Generic LRU infrastructure + */ +#include +#include +#include +#include +#include +#include +#include +#include "slab.h" +#include "internal.h" + +#ifdef CONFIG_MEMCG_KMEM +static LIST_HEAD(memcg_list_lrus); +static DEFINE_MUTEX(list_lrus_mutex); + +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return lru->memcg_aware; +} + +static void list_lru_register(struct list_lru *lru) +{ + if (!list_lru_memcg_aware(lru)) + return; + + mutex_lock(&list_lrus_mutex); + list_add(&lru->list, &memcg_list_lrus); + mutex_unlock(&list_lrus_mutex); +} + +static void list_lru_unregister(struct list_lru *lru) +{ + if (!list_lru_memcg_aware(lru)) + return; + + mutex_lock(&list_lrus_mutex); + list_del(&lru->list); + mutex_unlock(&list_lrus_mutex); +} + +static int lru_shrinker_id(struct list_lru *lru) +{ + return lru->shrinker_id; +} + +static inline struct list_lru_one * +list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) +{ + if (list_lru_memcg_aware(lru) && idx >= 0) { + struct list_lru_memcg *mlru = xa_load(&lru->xa, idx); + + return mlru ? 
&mlru->node[nid] : NULL; + } + return &lru->node[nid].lru; +} + +static inline struct list_lru_one * +list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, + struct mem_cgroup **memcg_ptr) +{ + struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l = &nlru->lru; + struct mem_cgroup *memcg = NULL; + + if (!list_lru_memcg_aware(lru)) + goto out; + + memcg = mem_cgroup_from_slab_obj(ptr); + if (!memcg) + goto out; + + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); +out: + if (memcg_ptr) + *memcg_ptr = memcg; + return l; +} +#else +static void list_lru_register(struct list_lru *lru) +{ +} + +static void list_lru_unregister(struct list_lru *lru) +{ +} + +static int lru_shrinker_id(struct list_lru *lru) +{ + return -1; +} + +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return false; +} + +static inline struct list_lru_one * +list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) +{ + return &lru->node[nid].lru; +} + +static inline struct list_lru_one * +list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, + struct mem_cgroup **memcg_ptr) +{ + if (memcg_ptr) + *memcg_ptr = NULL; + return &lru->node[nid].lru; +} +#endif /* CONFIG_MEMCG_KMEM */ + +bool list_lru_add(struct list_lru *lru, struct list_head *item) +{ + int nid = page_to_nid(virt_to_page(item)); + struct list_lru_node *nlru = &lru->node[nid]; + struct mem_cgroup *memcg; + struct list_lru_one *l; + + spin_lock(&nlru->lock); + if (list_empty(item)) { + l = list_lru_from_kmem(lru, nid, item, &memcg); + list_add_tail(item, &l->list); + /* Set shrinker bit if the first element was added */ + if (!l->nr_items++) + set_shrinker_bit(memcg, nid, + lru_shrinker_id(lru)); + nlru->nr_items++; + spin_unlock(&nlru->lock); + return true; + } + spin_unlock(&nlru->lock); + return false; +} +EXPORT_SYMBOL_GPL(list_lru_add); + +bool list_lru_del(struct list_lru *lru, struct list_head *item) +{ + int nid = page_to_nid(virt_to_page(item)); + struct 
list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; + + spin_lock(&nlru->lock); + if (!list_empty(item)) { + l = list_lru_from_kmem(lru, nid, item, NULL); + list_del_init(item); + l->nr_items--; + nlru->nr_items--; + spin_unlock(&nlru->lock); + return true; + } + spin_unlock(&nlru->lock); + return false; +} +EXPORT_SYMBOL_GPL(list_lru_del); + +void list_lru_isolate(struct list_lru_one *list, struct list_head *item) +{ + list_del_init(item); + list->nr_items--; +} +EXPORT_SYMBOL_GPL(list_lru_isolate); + +void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, + struct list_head *head) +{ + list_move(item, head); + list->nr_items--; +} +EXPORT_SYMBOL_GPL(list_lru_isolate_move); + +unsigned long list_lru_count_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg) +{ + struct list_lru_one *l; + long count; + + rcu_read_lock(); + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + count = l ? READ_ONCE(l->nr_items) : 0; + rcu_read_unlock(); + + if (unlikely(count < 0)) + count = 0; + + return count; +} +EXPORT_SYMBOL_GPL(list_lru_count_one); + +unsigned long list_lru_count_node(struct list_lru *lru, int nid) +{ + struct list_lru_node *nlru; + + nlru = &lru->node[nid]; + return nlru->nr_items; +} +EXPORT_SYMBOL_GPL(list_lru_count_node); + +static unsigned long +__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; + struct list_head *item, *n; + unsigned long isolated = 0; + +restart: + l = list_lru_from_memcg_idx(lru, nid, memcg_idx); + if (!l) + goto out; + + list_for_each_safe(item, n, &l->list) { + enum lru_status ret; + + /* + * decrement nr_to_walk first so that we don't livelock if we + * get stuck on large numbers of LRU_RETRY items + */ + if (!*nr_to_walk) + break; + --*nr_to_walk; + + ret = isolate(item, l, &nlru->lock, cb_arg); + switch (ret) { + 
case LRU_REMOVED_RETRY: + assert_spin_locked(&nlru->lock); + fallthrough; + case LRU_REMOVED: + isolated++; + nlru->nr_items--; + /* + * If the lru lock has been dropped, our list + * traversal is now invalid and so we have to + * restart from scratch. + */ + if (ret == LRU_REMOVED_RETRY) + goto restart; + break; + case LRU_ROTATE: + list_move_tail(item, &l->list); + break; + case LRU_SKIP: + break; + case LRU_RETRY: + /* + * The lru lock has been dropped, our list traversal is + * now invalid and so we have to restart from scratch. + */ + assert_spin_locked(&nlru->lock); + goto restart; + default: + BUG(); + } + } +out: + return isolated; +} + +unsigned long +list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + struct list_lru_node *nlru = &lru->node[nid]; + unsigned long ret; + + spin_lock(&nlru->lock); + ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, + cb_arg, nr_to_walk); + spin_unlock(&nlru->lock); + return ret; +} +EXPORT_SYMBOL_GPL(list_lru_walk_one); + +unsigned long +list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + struct list_lru_node *nlru = &lru->node[nid]; + unsigned long ret; + + spin_lock_irq(&nlru->lock); + ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, + cb_arg, nr_to_walk); + spin_unlock_irq(&nlru->lock); + return ret; +} + +unsigned long list_lru_walk_node(struct list_lru *lru, int nid, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + unsigned long isolated = 0; /* same type as the walkers' return value */ + + isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, + nr_to_walk); + +#ifdef CONFIG_MEMCG_KMEM + if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { + struct list_lru_memcg *mlru; + unsigned long index; + + xa_for_each(&lru->xa, index, mlru) { + struct list_lru_node *nlru = &lru->node[nid]; + +
spin_lock(&nlru->lock); + isolated += __list_lru_walk_one(lru, nid, index, + isolate, cb_arg, + nr_to_walk); + spin_unlock(&nlru->lock); + + if (*nr_to_walk <= 0) + break; + } + } +#endif + + return isolated; +} +EXPORT_SYMBOL_GPL(list_lru_walk_node); + +static void init_one_lru(struct list_lru_one *l) +{ + INIT_LIST_HEAD(&l->list); + l->nr_items = 0; +} + +#ifdef CONFIG_MEMCG_KMEM +static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) +{ + int nid; + struct list_lru_memcg *mlru; + + mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp); + if (!mlru) + return NULL; + + for_each_node(nid) + init_one_lru(&mlru->node[nid]); + + return mlru; +} + +static void memcg_list_lru_free(struct list_lru *lru, int src_idx) +{ + struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx); + + /* + * The __list_lru_walk_one() can walk the list of this node. + * We need kvfree_rcu() here. And the walking of the list + * is under lru->node[nid]->lock, which can serve as a RCU + * read-side critical section. + */ + if (mlru) + kvfree_rcu(mlru, rcu); +} + +static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) +{ + if (memcg_aware) + xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ); + lru->memcg_aware = memcg_aware; +} + +static void memcg_destroy_list_lru(struct list_lru *lru) +{ + XA_STATE(xas, &lru->xa, 0); + struct list_lru_memcg *mlru; + + if (!list_lru_memcg_aware(lru)) + return; + + xas_lock_irq(&xas); + xas_for_each(&xas, mlru, ULONG_MAX) { + kfree(mlru); + xas_store(&xas, NULL); + } + xas_unlock_irq(&xas); +} + +static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, + int src_idx, struct mem_cgroup *dst_memcg) +{ + struct list_lru_node *nlru = &lru->node[nid]; + int dst_idx = dst_memcg->kmemcg_id; + struct list_lru_one *src, *dst; + + /* + * Since list_lru_{add,del} may be called under an IRQ-safe lock, + * we have to use IRQ-safe primitives here to avoid deadlock. 
+ */ + spin_lock_irq(&nlru->lock); + + src = list_lru_from_memcg_idx(lru, nid, src_idx); + if (!src) + goto out; + dst = list_lru_from_memcg_idx(lru, nid, dst_idx); + + list_splice_init(&src->list, &dst->list); + + if (src->nr_items) { + dst->nr_items += src->nr_items; + set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); + src->nr_items = 0; + } +out: + spin_unlock_irq(&nlru->lock); +} + +static void memcg_reparent_list_lru(struct list_lru *lru, + int src_idx, struct mem_cgroup *dst_memcg) +{ + int i; + + for_each_node(i) + memcg_reparent_list_lru_node(lru, i, src_idx, dst_memcg); + + memcg_list_lru_free(lru, src_idx); +} + +void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ + struct cgroup_subsys_state *css; + struct list_lru *lru; + int src_idx = memcg->kmemcg_id; + + /* + * Change kmemcg_id of this cgroup and all its descendants to the + * parent's id, and then move all entries from this cgroup's list_lrus + * to ones of the parent. + * + * After we have finished, all list_lrus corresponding to this cgroup + * are guaranteed to remain empty. So we can safely free this cgroup's + * list lrus in memcg_list_lru_free(). + * + * Changing ->kmemcg_id to the parent can prevent memcg_list_lru_alloc() + * from allocating list lrus for this cgroup after memcg_list_lru_free() + * call. 
+ */ + rcu_read_lock(); + css_for_each_descendant_pre(css, &memcg->css) { + struct mem_cgroup *child; + + child = mem_cgroup_from_css(css); + WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id); + } + rcu_read_unlock(); + + mutex_lock(&list_lrus_mutex); + list_for_each_entry(lru, &memcg_list_lrus, list) + memcg_reparent_list_lru(lru, src_idx, parent); + mutex_unlock(&list_lrus_mutex); +} + +static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, + struct list_lru *lru) +{ + int idx = memcg->kmemcg_id; + + return idx < 0 || xa_load(&lru->xa, idx); +} + +int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, + gfp_t gfp) +{ + int i; + unsigned long flags; + struct list_lru_memcg_table { + struct list_lru_memcg *mlru; + struct mem_cgroup *memcg; + } *table; + XA_STATE(xas, &lru->xa, 0); + + if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) + return 0; + + gfp &= GFP_RECLAIM_MASK; + table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp); + if (!table) + return -ENOMEM; + + /* + * Because the list_lru can be reparented to the parent cgroup's + * list_lru, we should make sure that this cgroup and all its + * ancestors have allocated list_lru_memcg. 
+ */ + for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) { + if (memcg_list_lru_allocated(memcg, lru)) + break; + + table[i].memcg = memcg; + table[i].mlru = memcg_init_list_lru_one(gfp); + if (!table[i].mlru) { + while (i--) + kfree(table[i].mlru); + kfree(table); + return -ENOMEM; + } + } + + xas_lock_irqsave(&xas, flags); + while (i--) { + int index = READ_ONCE(table[i].memcg->kmemcg_id); + struct list_lru_memcg *mlru = table[i].mlru; + + xas_set(&xas, index); +retry: + if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) { + kfree(mlru); + } else { + xas_store(&xas, mlru); + if (xas_error(&xas) == -ENOMEM) { + xas_unlock_irqrestore(&xas, flags); + if (xas_nomem(&xas, gfp)) + xas_set_err(&xas, 0); + xas_lock_irqsave(&xas, flags); + /* + * The xas lock has been released, this memcg + * can be reparented before us. So reload + * memcg id. More details see the comments + * in memcg_reparent_list_lrus(). + */ + index = READ_ONCE(table[i].memcg->kmemcg_id); + if (index < 0) + xas_set_err(&xas, 0); + else if (!xas_error(&xas) && index != xas.xa_index) + xas_set(&xas, index); + goto retry; + } + } + } + /* xas_nomem() is used to free memory instead of memory allocation. 
*/ + if (xas.xa_alloc) + xas_nomem(&xas, gfp); + xas_unlock_irqrestore(&xas, flags); + kfree(table); + + return xas_error(&xas); +} +#else +static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) +{ +} + +static void memcg_destroy_list_lru(struct list_lru *lru) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +int __list_lru_init(struct list_lru *lru, bool memcg_aware, + struct lock_class_key *key, struct shrinker *shrinker) +{ + int i; + +#ifdef CONFIG_MEMCG_KMEM + if (shrinker) + lru->shrinker_id = shrinker->id; + else + lru->shrinker_id = -1; +#endif + + lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); + if (!lru->node) + return -ENOMEM; + + for_each_node(i) { + spin_lock_init(&lru->node[i].lock); + if (key) + lockdep_set_class(&lru->node[i].lock, key); + init_one_lru(&lru->node[i].lru); + } + + memcg_init_list_lru(lru, memcg_aware); + list_lru_register(lru); + + return 0; +} +EXPORT_SYMBOL_GPL(__list_lru_init); + +void list_lru_destroy(struct list_lru *lru) +{ + /* Already destroyed or not yet initialized? */ + if (!lru->node) + return; + + list_lru_unregister(lru); + + memcg_destroy_list_lru(lru); + kfree(lru->node); + lru->node = NULL; + +#ifdef CONFIG_MEMCG_KMEM + lru->shrinker_id = -1; +#endif +} +EXPORT_SYMBOL_GPL(list_lru_destroy); diff --git a/mm/maccess.c b/mm/maccess.c new file mode 100644 index 000000000..518a25667 --- /dev/null +++ b/mm/maccess.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Access kernel or user memory without faulting. 
+ */ +#include +#include +#include +#include + +bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src, + size_t size) +{ + return true; +} + +#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \ + while (len >= sizeof(type)) { \ + __get_kernel_nofault(dst, src, type, err_label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +long copy_from_kernel_nofault(void *dst, const void *src, size_t size) +{ + unsigned long align = 0; + + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + align = (unsigned long)dst | (unsigned long)src; + + if (!copy_from_kernel_nofault_allowed(src, size)) + return -ERANGE; + + pagefault_disable(); + if (!(align & 7)) + copy_from_kernel_nofault_loop(dst, src, size, u64, Efault); + if (!(align & 3)) + copy_from_kernel_nofault_loop(dst, src, size, u32, Efault); + if (!(align & 1)) + copy_from_kernel_nofault_loop(dst, src, size, u16, Efault); + copy_from_kernel_nofault_loop(dst, src, size, u8, Efault); + pagefault_enable(); + return 0; +Efault: + pagefault_enable(); + return -EFAULT; +} +EXPORT_SYMBOL_GPL(copy_from_kernel_nofault); + +#define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \ + while (len >= sizeof(type)) { \ + __put_kernel_nofault(dst, src, type, err_label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +long copy_to_kernel_nofault(void *dst, const void *src, size_t size) +{ + unsigned long align = 0; + + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) + align = (unsigned long)dst | (unsigned long)src; + + pagefault_disable(); + if (!(align & 7)) + copy_to_kernel_nofault_loop(dst, src, size, u64, Efault); + if (!(align & 3)) + copy_to_kernel_nofault_loop(dst, src, size, u32, Efault); + if (!(align & 1)) + copy_to_kernel_nofault_loop(dst, src, size, u16, Efault); + copy_to_kernel_nofault_loop(dst, src, size, u8, Efault); + pagefault_enable(); + return 0; +Efault: + pagefault_enable(); + return 
-EFAULT; +} + +long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) +{ + const void *src = unsafe_addr; + + if (unlikely(count <= 0)) + return 0; + if (!copy_from_kernel_nofault_allowed(unsafe_addr, count)) + return -ERANGE; + + pagefault_disable(); + do { + __get_kernel_nofault(dst, src, u8, Efault); + dst++; + src++; + } while (dst[-1] && src - unsafe_addr < count); + pagefault_enable(); + + dst[-1] = '\0'; + return src - unsafe_addr; +Efault: + pagefault_enable(); + dst[0] = '\0'; + return -EFAULT; +} + +/** + * copy_from_user_nofault(): safely attempt to read from a user-space location + * @dst: pointer to the buffer that shall take the data + * @src: address to read from. This must be a user address. + * @size: size of the data chunk + * + * Safely read from user address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. + */ +long copy_from_user_nofault(void *dst, const void __user *src, size_t size) +{ + long ret = -EFAULT; + + if (!__access_ok(src, size)) + return ret; + + if (!nmi_uaccess_okay()) + return ret; + + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, src, size); + pagefault_enable(); + + if (ret) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(copy_from_user_nofault); + +/** + * copy_to_user_nofault(): safely attempt to write to a user-space location + * @dst: address to write to + * @src: pointer to the data that shall be written + * @size: size of the data chunk + * + * Safely write to address @dst from the buffer at @src. If a kernel fault + * happens, handle that and return -EFAULT. 
+ */ +long copy_to_user_nofault(void __user *dst, const void *src, size_t size) +{ + long ret = -EFAULT; + + if (access_ok(dst, size)) { + pagefault_disable(); + ret = __copy_to_user_inatomic(dst, src, size); + pagefault_enable(); + } + + if (ret) + return -EFAULT; + return 0; +} +EXPORT_SYMBOL_GPL(copy_to_user_nofault); + +/** + * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user + * address. + * @dst: Destination address, in kernel space. This buffer must be at + * least @count bytes long. + * @unsafe_addr: Unsafe user address. + * @count: Maximum number of bytes to copy, including the trailing NUL. + * + * Copies a NUL-terminated string from unsafe user address to kernel buffer. + * + * On success, returns the length of the string INCLUDING the trailing NUL. + * + * If access fails, returns -EFAULT (some data may have been copied + * and the trailing NUL added). + * + * If @count is smaller than the length of the string, copies @count-1 bytes, + * sets the last byte of @dst buffer to NUL and returns @count. + */ +long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, + long count) +{ + long ret; + + if (unlikely(count <= 0)) + return 0; + + pagefault_disable(); + ret = strncpy_from_user(dst, unsafe_addr, count); + pagefault_enable(); + + if (ret >= count) { + ret = count; + dst[ret - 1] = '\0'; + } else if (ret > 0) { + ret++; + } + + return ret; +} + +/** + * strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL. + * @unsafe_addr: The string to measure. + * @count: Maximum count (including NUL) + * + * Get the size of a NUL-terminated string in user space without pagefault. + * + * Returns the size of the string INCLUDING the terminating NUL. + * + * If the string is too long, returns a number larger than @count. User + * has to check the return value against "> count". + * On exception (or invalid count), returns 0. + * + * Unlike strnlen_user, this can be used from IRQ handler etc. 
because + * it disables pagefaults. + */ +long strnlen_user_nofault(const void __user *unsafe_addr, long count) +{ + long ret; /* long, like the return type: avoid truncating the length */ + + pagefault_disable(); + ret = strnlen_user(unsafe_addr, count); + pagefault_enable(); + + return ret; +} + +void __copy_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} +EXPORT_SYMBOL(__copy_overflow); diff --git a/mm/madvise.c b/mm/madvise.c new file mode 100644 index 000000000..5973399b2 --- /dev/null +++ b/mm/madvise.c @@ -0,0 +1,1514 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/mm/madvise.c + * + * Copyright (C) 1999 Linus Torvalds + * Copyright (C) 2002 Christoph Hellwig + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" +#include "swap.h" + +struct madvise_walk_private { + struct mmu_gather *tlb; + bool pageout; +}; + +/* + * Any behaviour which results in changes to the vma->vm_flags needs to + * take mmap_lock for writing. Others, which simply traverse vmas, need + * to only take it for reading. + */ +static int madvise_need_mmap_write(int behavior) +{ + switch (behavior) { + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_COLD: + case MADV_PAGEOUT: + case MADV_FREE: + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + case MADV_COLLAPSE: + return 0; + default: + /* be safe, default to 1.
list exceptions explicitly */ + return 1; + } +} + +#ifdef CONFIG_ANON_VMA_NAME +struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + struct anon_vma_name *anon_name; + size_t count; + + /* Add 1 for NUL terminator at the end of the anon_name->name */ + count = strlen(name) + 1; + anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); + if (anon_name) { + kref_init(&anon_name->kref); + memcpy(anon_name->name, name, count); + } + + return anon_name; +} + +void anon_vma_name_free(struct kref *kref) +{ + struct anon_vma_name *anon_name = + container_of(kref, struct anon_vma_name, kref); + kfree(anon_name); +} + +struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) +{ + mmap_assert_locked(vma->vm_mm); + + if (vma->vm_file) + return NULL; + + return vma->anon_name; +} + +/* mmap_lock should be write-locked */ +static int replace_anon_vma_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) +{ + struct anon_vma_name *orig_name = anon_vma_name(vma); + + if (!anon_name) { + vma->anon_name = NULL; + anon_vma_name_put(orig_name); + return 0; + } + + if (anon_vma_name_eq(orig_name, anon_name)) + return 0; + + vma->anon_name = anon_vma_name_reuse(anon_name); + anon_vma_name_put(orig_name); + + return 0; +} +#else /* CONFIG_ANON_VMA_NAME */ +static int replace_anon_vma_name(struct vm_area_struct *vma, + struct anon_vma_name *anon_name) +{ + if (anon_name) + return -EINVAL; + + return 0; +} +#endif /* CONFIG_ANON_VMA_NAME */ +/* + * Update the vm_flags on a region of a vma, splitting it or merging it as + * necessary. Must be called with mmap_lock held for writing. + * Caller should ensure anon_name stability by raising its refcount even when + * anon_name belongs to a valid vma because this function might free that vma.
+ */ +static int madvise_update_vma(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long new_flags, + struct anon_vma_name *anon_name) +{ + struct mm_struct *mm = vma->vm_mm; + int error; + pgoff_t pgoff; + + if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { + *prev = vma; + return 0; + } + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_name); + if (*prev) { + vma = *prev; + goto success; + } + + *prev = vma; + + if (start != vma->vm_start) { + if (unlikely(mm->map_count >= sysctl_max_map_count)) + return -ENOMEM; + error = __split_vma(mm, vma, start, 1); + if (error) + return error; + } + + if (end != vma->vm_end) { + if (unlikely(mm->map_count >= sysctl_max_map_count)) + return -ENOMEM; + error = __split_vma(mm, vma, end, 0); + if (error) + return error; + } + +success: + /* + * vm_flags is protected by the mmap_lock held in write mode. 
+ */ + vma->vm_flags = new_flags; + if (!vma->vm_file) { + error = replace_anon_vma_name(vma, anon_name); + if (error) + return error; + } + + return 0; +} + +#ifdef CONFIG_SWAP +static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->private; + unsigned long index; + struct swap_iocb *splug = NULL; + + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + return 0; + + for (index = start; index != end; index += PAGE_SIZE) { + pte_t pte; + swp_entry_t entry; + struct page *page; + spinlock_t *ptl; + pte_t *ptep; + + ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl); + pte = *ptep; + pte_unmap_unlock(ptep, ptl); + + if (!is_swap_pte(pte)) + continue; + entry = pte_to_swp_entry(pte); + if (unlikely(non_swap_entry(entry))) + continue; + + page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, + vma, index, false, &splug); + if (page) + put_page(page); + } + swap_read_unplug(splug); + + return 0; +} + +static const struct mm_walk_ops swapin_walk_ops = { + .pmd_entry = swapin_walk_pmd_entry, +}; + +static void force_shm_swapin_readahead(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct address_space *mapping) +{ + XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); + pgoff_t end_index = linear_page_index(vma, end) - 1; /* end is exclusive, xas_for_each() max is inclusive */ + struct page *page; + struct swap_iocb *splug = NULL; + + rcu_read_lock(); + xas_for_each(&xas, page, end_index) { + swp_entry_t swap; + + if (!xa_is_value(page)) + continue; + swap = radix_to_swp_entry(page); + /* There might be swapin error entries in shmem mapping.
*/ + if (non_swap_entry(swap)) + continue; + xas_pause(&xas); + rcu_read_unlock(); + /* The swap-in call below runs outside the RCU read section, which is re-entered afterwards. */ + page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, + NULL, 0, false, &splug); + if (page) + put_page(page); + + rcu_read_lock(); + } + rcu_read_unlock(); + swap_read_unplug(splug); + + lru_add_drain(); /* Push any new pages onto the LRU now */ +} +#endif /* CONFIG_SWAP */ + +/* + * Schedule all required I/O operations. Do not wait for completion. + * + * With CONFIG_SWAP, vmas without a file are walked with swapin_walk_ops + * and shmem mappings go through force_shm_swapin_readahead(); without + * CONFIG_SWAP a vma without a file yields -EBADF. DAX inodes are + * skipped. Any other file is passed to + * vfs_fadvise(POSIX_FADV_WILLNEED) with the mmap_lock temporarily + * dropped, which is signalled to the caller by setting *prev to NULL. + */ +static long madvise_willneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + struct file *file = vma->vm_file; + loff_t offset; + + *prev = vma; +#ifdef CONFIG_SWAP + if (!file) { + walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); + lru_add_drain(); /* Push any new pages onto the LRU now */ + return 0; + } + + if (shmem_mapping(file->f_mapping)) { + force_shm_swapin_readahead(vma, start, end, + file->f_mapping); + return 0; + } +#else + if (!file) + return -EBADF; +#endif + + if (IS_DAX(file_inode(file))) { + /* no bad return value, but ignore advice */ + return 0; + } + + /* + * Filesystem's fadvise may need to take various locks. We need to + * explicitly grab a reference because the vma (and hence the + * vma's reference to the file) can go away as soon as we drop + * mmap_lock.
+ */ + *prev = NULL; /* tell sys_madvise we drop mmap_lock */ + get_file(file); + offset = (loff_t)(start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + mmap_read_unlock(mm); + vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); + fput(file); + mmap_read_lock(mm); + return 0; +} +/* + * pmd_entry callback behind cold_walk_ops, shared by MADV_COLD and + * MADV_PAGEOUT. For pages that are mapped exactly once it clears the + * young/referenced state and then either deactivates the page or, + * when private->pageout is set, isolates it from the LRU and hands the + * collected list to reclaim_pages(). Returns -EINTR if a fatal signal + * is pending, 0 otherwise. + */ +static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct madvise_walk_private *private = walk->private; + struct mmu_gather *tlb = private->tlb; + bool pageout = private->pageout; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + pte_t *orig_pte, *pte, ptent; + spinlock_t *ptl; + struct page *page = NULL; + LIST_HEAD(page_list); + + if (fatal_signal_pending(current)) + return -EINTR; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(*pmd)) { + pmd_t orig_pmd; + unsigned long next = pmd_addr_end(addr, end); + + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) + goto huge_unlock; + + if (unlikely(!pmd_present(orig_pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(orig_pmd)); + goto huge_unlock; + } + + page = pmd_page(orig_pmd); + + /* Do not interfere with other mappings of this page */ + if (page_mapcount(page) != 1) + goto huge_unlock; + /* The range covers only part of this huge pmd: try to split the page and, on success, continue in the pte loop. */ + if (next - addr != HPAGE_PMD_SIZE) { + int err; + + get_page(page); + spin_unlock(ptl); + lock_page(page); + err = split_huge_page(page); + unlock_page(page); + put_page(page); + if (!err) + goto regular_page; + return 0; + } + + if (pmd_young(orig_pmd)) { + pmdp_invalidate(vma, addr, pmd); + orig_pmd = pmd_mkold(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + } + + ClearPageReferenced(page); + test_and_clear_page_young(page); + if (pageout) { + if (!isolate_lru_page(page)) { + if (PageUnevictable(page)) + putback_lru_page(page); +
else + list_add(&page->lru, &page_list); + } + } else + deactivate_page(page); +huge_unlock: + spin_unlock(ptl); + if (pageout) + reclaim_pages(&page_list); + return 0; + } + +regular_page: + if (pmd_trans_unstable(pmd)) + return 0; +#endif + tlb_change_page_size(tlb, PAGE_SIZE); + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + flush_tlb_batched_pending(mm); + arch_enter_lazy_mmu_mode(); + for (; addr < end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (pte_none(ptent)) + continue; + + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page || is_zone_device_page(page)) + continue; + + /* + * Creating a THP is expensive, so split one only when we are + * sure it is worth it, i.e. when we are its only owner. + * After a successful split the loop steps back one pte and + * revisits the same address. + */ + if (PageTransCompound(page)) { + if (page_mapcount(page) != 1) + break; + get_page(page); + if (!trylock_page(page)) { + put_page(page); + break; + } + pte_unmap_unlock(orig_pte, ptl); + if (split_huge_page(page)) { + unlock_page(page); + put_page(page); + orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + break; + } + unlock_page(page); + put_page(page); + orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + /* + * Do not interfere with pages that have other mappings or + * that are not on the LRU. + */ + if (!PageLRU(page) || page_mapcount(page) != 1) + continue; + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (pte_young(ptent)) { + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + ptent = pte_mkold(ptent); + set_pte_at(mm, addr, pte, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + } + + /* + * We are deactivating the page to accelerate its reclaim. + * The VM cannot reclaim the page unless we clear PG_young. + * As a side effect, this confuses idle-page tracking, + * which will miss the recent reference history.
+ */ + ClearPageReferenced(page); + test_and_clear_page_young(page); + if (pageout) { + if (!isolate_lru_page(page)) { + if (PageUnevictable(page)) + putback_lru_page(page); + else + list_add(&page->lru, &page_list); + } + } else + deactivate_page(page); + } + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(orig_pte, ptl); + if (pageout) + reclaim_pages(&page_list); + cond_resched(); + + return 0; +} +/* Page-walk operations shared by MADV_COLD and MADV_PAGEOUT. */ +static const struct mm_walk_ops cold_walk_ops = { + .pmd_entry = madvise_cold_or_pageout_pte_range, +}; +/* Walk [addr, end) of @vma with pageout disabled, so pages are only deactivated. */ +static void madvise_cold_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + struct madvise_walk_private walk_private = { + .pageout = false, + .tlb = tlb, + }; + + tlb_start_vma(tlb, vma); + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); + tlb_end_vma(tlb, vma); +} +/* MADV_COLD and MADV_PAGEOUT are refused on VM_LOCKED, VM_PFNMAP and VM_HUGETLB vmas. */ +static inline bool can_madv_lru_vma(struct vm_area_struct *vma) +{ + return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); +} +/* MADV_COLD: -EINVAL if can_madv_lru_vma() rejects the vma, otherwise walk the range and return 0. */ +static long madvise_cold(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start_addr, unsigned long end_addr) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + *prev = vma; + if (!can_madv_lru_vma(vma)) + return -EINVAL; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + madvise_cold_page_range(&tlb, vma, start_addr, end_addr); + tlb_finish_mmu(&tlb); + + return 0; +} +/* Same walk as madvise_cold_page_range(), but with pageout enabled. */ +static void madvise_pageout_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + struct madvise_walk_private walk_private = { + .pageout = true, + .tlb = tlb, + }; + + tlb_start_vma(tlb, vma); + walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); + tlb_end_vma(tlb, vma); +} +/* Anonymous vmas may always be paged out; vmas with a file only if the ownership/permission check below passes. */ +static inline bool can_do_pageout(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return true; + if (!vma->vm_file) + return false; + /* + * paging out pagecache only for non-anonymous mappings that
correspond + * to the files the calling process could (if tried) open for writing; + * otherwise we'd be including shared non-exclusive mappings, which + * opens a side channel. + */ + return inode_owner_or_capable(&init_user_ns, + file_inode(vma->vm_file)) || + file_permission(vma->vm_file, MAY_WRITE) == 0; +} +/* MADV_PAGEOUT: -EINVAL if can_madv_lru_vma() rejects the vma; returns 0 without doing anything if can_do_pageout() refuses it. */ +static long madvise_pageout(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start_addr, unsigned long end_addr) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + *prev = vma; + if (!can_madv_lru_vma(vma)) + return -EINVAL; + + if (!can_do_pageout(vma)) + return 0; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); + tlb_finish_mmu(&tlb); + + return 0; +} +/* pmd_entry callback for MADV_FREE (installed in madvise_free_walk_ops); walk->private is the mmu_gather. */ +static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) + +{ + struct mmu_gather *tlb = walk->private; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + spinlock_t *ptl; + pte_t *orig_pte, *pte, ptent; + struct folio *folio; + struct page *page; + int nr_swap = 0; + unsigned long next; + + next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) + if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) + goto next; + + if (pmd_trans_unstable(pmd)) + return 0; + + tlb_change_page_size(tlb, PAGE_SIZE); + orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + flush_tlb_batched_pending(mm); + arch_enter_lazy_mmu_mode(); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (pte_none(ptent)) + continue; + /* + * If the pte holds a swap entry, just clear the page table + * entry: that avoids a later swap-in, which is more expensive + * than (page allocation + zeroing).
+ */ + if (!pte_present(ptent)) { + swp_entry_t entry; + /* nr_swap goes negative here: it is the delta later applied to MM_SWAPENTS. */ + entry = pte_to_swp_entry(ptent); + if (!non_swap_entry(entry)) { + nr_swap--; + free_swap_and_cache(entry); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + } else if (is_hwpoison_entry(entry) || + is_swapin_error_entry(entry)) { + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + } + continue; + } + + page = vm_normal_page(vma, addr, ptent); + if (!page || is_zone_device_page(page)) + continue; + folio = page_folio(page); + + /* + * If pmd isn't transhuge but the folio is large and + * is owned by only this process, split it and + * deactivate all pages. After a successful split the + * loop steps back one pte and revisits the same address. + */ + if (folio_test_large(folio)) { + if (folio_estimated_sharers(folio) != 1) + break; + folio_get(folio); + if (!folio_trylock(folio)) { + folio_put(folio); + goto out; + } + pte_unmap_unlock(orig_pte, ptl); + if (split_folio(folio)) { + folio_unlock(folio); + folio_put(folio); + orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + goto out; + } + folio_unlock(folio); + folio_put(folio); + orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { + if (!folio_trylock(folio)) + continue; + /* + * If folio is shared with others, we mustn't clear + * the folio's dirty flag. + */ + if (folio_mapcount(folio) != 1) { + folio_unlock(folio); + continue; + } + + if (folio_test_swapcache(folio) && + !folio_free_swap(folio)) { + folio_unlock(folio); + continue; + } + + folio_clear_dirty(folio); + folio_unlock(folio); + } + + if (pte_young(ptent) || pte_dirty(ptent)) { + /* + * Some architectures (e.g. PPC) don't update the TLB + * with set_pte_at and tlb_remove_tlb_entry, so for + * portability, remap the pte with old|clean + * after clearing it.
+ */ + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + + ptent = pte_mkold(ptent); + ptent = pte_mkclean(ptent); + set_pte_at(mm, addr, pte, ptent); + tlb_remove_tlb_entry(tlb, pte, addr); + } + mark_page_lazyfree(&folio->page); + } +out: + if (nr_swap) { + if (current->mm == mm) + sync_mm_rss(mm); + + add_mm_counter(mm, MM_SWAPENTS, nr_swap); + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(orig_pte, ptl); + cond_resched(); +next: + return 0; +} + +static const struct mm_walk_ops madvise_free_walk_ops = { + .pmd_entry = madvise_free_pte_range, +}; +/* + * Apply MADV_FREE to the part of [start_addr, end_addr) that lies inside + * @vma. Returns -EINVAL for non-anonymous vmas or when the range does + * not overlap the vma, 0 otherwise. + */ +static int madvise_free_single_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; + struct mmu_gather tlb; + + /* MADV_FREE only works on anonymous vmas at the moment */ + if (!vma_is_anonymous(vma)) + return -EINVAL; + /* Clamp the requested range to this vma. */ + range.start = max(vma->vm_start, start_addr); + if (range.start >= vma->vm_end) + return -EINVAL; + range.end = min(vma->vm_end, end_addr); + if (range.end <= vma->vm_start) + return -EINVAL; + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + range.start, range.end); + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + update_hiwater_rss(mm); + + mmu_notifier_invalidate_range_start(&range); + tlb_start_vma(&tlb, vma); + walk_page_range(vma->vm_mm, range.start, range.end, + &madvise_free_walk_ops, &tlb); + tlb_end_vma(&tlb, vma); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); + + return 0; +} + +/* + * Application no longer needs these pages. If the pages are dirty, + * it's OK to just throw them away. The app will be more careful about + * data it wants to keep. Be sure to free swap resources too.
The + * zap_page_range_single call sets things up for shrink_active_list to actually + * free these pages later if no one else has touched them in the meantime, + * although we could add these pages to a global reuse list for + * shrink_active_list to pick up before reclaiming other pages. + * + * NB: This interface discards data rather than pushes it out to swap, + * as some implementations do. This has performance implications for + * applications like large transactional databases which want to discard + * pages in anonymous maps after committing to backing store the data + * that was kept in them. There is no reason to write this data out to + * the swap area if the application is discarding it. + * + * An interface that causes the system to free clean pages and flush + * dirty pages is already available as msync(MS_INVALIDATE). + */ +static long madvise_dontneed_single_vma(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + zap_page_range_single(vma, start, end - start, NULL); + return 0; +} +/* + * Check whether @vma may be the target of MADV_DONTNEED, + * MADV_DONTNEED_LOCKED or MADV_FREE. Non-hugetlb vmas are rejected if + * they are VM_PFNMAP, or VM_LOCKED unless @behavior is + * MADV_DONTNEED_LOCKED. Hugetlb vmas are accepted only for + * MADV_DONTNEED[_LOCKED] with a huge-page-aligned @start; in that case + * *end is rounded down to a huge page boundary. + */ +static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, + unsigned long start, + unsigned long *end, + int behavior) +{ + if (!is_vm_hugetlb_page(vma)) { + unsigned int forbidden = VM_PFNMAP; + + if (behavior != MADV_DONTNEED_LOCKED) + forbidden |= VM_LOCKED; + + return !(vma->vm_flags & forbidden); + } + + if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) + return false; + if (start & ~huge_page_mask(hstate_vma(vma))) + return false; + + /* + * Madvise callers expect the length to be rounded up to PAGE_SIZE + * boundaries, and may be unaware that this VMA uses huge pages. + * Avoid unexpected data loss by rounding down the number of + * huge pages freed.
+ */ + *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); + + return true; +} +/* + * Common entry point for MADV_DONTNEED, MADV_DONTNEED_LOCKED and + * MADV_FREE on a single vma. If userfaultfd_remove() returns false the + * mmap_lock was dropped: it is retaken for reading, *prev is set to + * NULL, and the vma is looked up and validated again before the + * operation is applied. + */ +static long madvise_dontneed_free(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + int behavior) +{ + struct mm_struct *mm = vma->vm_mm; + + *prev = vma; + if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) + return -EINVAL; + /* The range can be empty here, e.g. after end was rounded down for a hugetlb vma. */ + if (start == end) + return 0; + + if (!userfaultfd_remove(vma, start, end)) { + *prev = NULL; /* mmap_lock has been dropped, prev is stale */ + + mmap_read_lock(mm); + vma = find_vma(mm, start); + if (!vma) + return -ENOMEM; + if (start < vma->vm_start) { + /* + * This "vma" under revalidation is the one + * with the lowest vma->vm_start where start + * is also < vma->vm_end. If start < + * vma->vm_start it means a hole materialized + * in the user address space within the + * virtual range passed to MADV_DONTNEED + * or MADV_FREE. + */ + return -ENOMEM; + } + /* + * Potential end adjustment for hugetlb vma is OK as + * the check below keeps end within vma. + */ + if (!madvise_dontneed_free_valid_vma(vma, start, &end, + behavior)) + return -EINVAL; + if (end > vma->vm_end) { + /* + * Don't fail if end > vma->vm_end. If the old + * vma was split while the mmap_lock was + * released the effect of the concurrent + * operation may not cause madvise() to + * have an undefined result. There may be an + * adjacent next vma that we'll walk + * next. userfaultfd_remove() will generate an + * UFFD_EVENT_REMOVE repetition on the + * end-vma->vm_end range, but the manager can + * handle a repetition fine.
+ */ + end = vma->vm_end; + } + VM_WARN_ON(start >= end); + } + + if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) + return madvise_dontneed_single_vma(vma, start, end); + else if (behavior == MADV_FREE) + return madvise_free_single_vma(vma, start, end); + else + return -EINVAL; +} +/* + * MADV_POPULATE_READ / MADV_POPULATE_WRITE: fault in [start, end) one vma + * at a time through faultin_vma_page_range(). If that call dropped the + * mmap_lock (locked == 0), the lock is retaken for reading, *prev is + * set to NULL and the vma is looked up again on the next iteration. + * Return values other than the ones listed in the switch below are + * logged once and reported as -ENOMEM. + */ +static long madvise_populate(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + int behavior) +{ + const bool write = behavior == MADV_POPULATE_WRITE; + struct mm_struct *mm = vma->vm_mm; + unsigned long tmp_end; + int locked = 1; + long pages; + + *prev = vma; + + while (start < end) { + /* + * We might have temporarily dropped the lock. For example, + * our VMA might have been split. + */ + if (!vma || start >= vma->vm_end) { + vma = vma_lookup(mm, start); + if (!vma) + return -ENOMEM; + } + + tmp_end = min_t(unsigned long, end, vma->vm_end); + /* Populate (prefault) page tables readable/writable. */ + pages = faultin_vma_page_range(vma, start, tmp_end, write, + &locked); + if (!locked) { + mmap_read_lock(mm); + locked = 1; + *prev = NULL; + vma = NULL; + } + if (pages < 0) { + switch (pages) { + case -EINTR: + return -EINTR; + case -EINVAL: /* Incompatible mappings / permissions. */ + return -EINVAL; + case -EHWPOISON: + return -EHWPOISON; + case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ + return -EFAULT; + default: + pr_warn_once("%s: unhandled return value: %ld\n", + __func__, pages); + fallthrough; + case -ENOMEM: + return -ENOMEM; + } + } + start += pages * PAGE_SIZE; + } + return 0; +} + +/* + * Application wants to free up the pages and associated backing store. + * This is effectively punching a hole into the middle of a file.
+ */ +static long madvise_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + loff_t offset; + int error; + struct file *f; + struct mm_struct *mm = vma->vm_mm; + + *prev = NULL; /* tell sys_madvise we drop mmap_lock */ + /* Locked vmas are rejected. */ + if (vma->vm_flags & VM_LOCKED) + return -EINVAL; + + f = vma->vm_file; + + if (!f || !f->f_mapping || !f->f_mapping->host) { + return -EINVAL; + } + /* Only mappings that are both shared and writable may punch a hole. */ + if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + return -EACCES; + /* File offset that corresponds to @start. */ + offset = (loff_t)(start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + + /* + * Filesystem's fallocate may need to take i_rwsem. We need to + * explicitly grab a reference because the vma (and hence the + * vma's reference to the file) can go away as soon as we drop + * mmap_lock. The lock is retaken for reading before returning. + */ + get_file(f); + if (userfaultfd_remove(vma, start, end)) { + /* mmap_lock was not released by userfaultfd_remove() */ + mmap_read_unlock(mm); + } + error = vfs_fallocate(f, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, end - start); + fput(f); + mmap_read_lock(mm); + return error; +} + +/* + * Apply an madvise behavior to a region of a vma. madvise_update_vma + * will handle splitting a vm area into separate areas, each area with its own + * behavior.
+ */ +static int madvise_vma_behavior(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long behavior) +{ + int error; + struct anon_vma_name *anon_name; + unsigned long new_flags = vma->vm_flags; + /* Behaviors that act on pages return straight from their helper; the others only compute new_flags for madvise_update_vma(). */ + switch (behavior) { + case MADV_REMOVE: + return madvise_remove(vma, prev, start, end); + case MADV_WILLNEED: + return madvise_willneed(vma, prev, start, end); + case MADV_COLD: + return madvise_cold(vma, prev, start, end); + case MADV_PAGEOUT: + return madvise_pageout(vma, prev, start, end); + case MADV_FREE: + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + return madvise_dontneed_free(vma, prev, start, end, behavior); + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + return madvise_populate(vma, prev, start, end, behavior); + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; + case MADV_SEQUENTIAL: + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; + break; + case MADV_RANDOM: + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; + break; + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) + return -EINVAL; + new_flags &= ~VM_DONTCOPY; + break; + case MADV_WIPEONFORK: + /* MADV_WIPEONFORK is only supported on anonymous memory.
*/ + if (vma->vm_file || vma->vm_flags & VM_SHARED) + return -EINVAL; + new_flags |= VM_WIPEONFORK; + break; + case MADV_KEEPONFORK: + new_flags &= ~VM_WIPEONFORK; + break; + case MADV_DONTDUMP: + new_flags |= VM_DONTDUMP; + break; + case MADV_DODUMP: + if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) + return -EINVAL; + new_flags &= ~VM_DONTDUMP; + break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + error = hugepage_madvise(vma, &new_flags, behavior); + if (error) + goto out; + break; + case MADV_COLLAPSE: + return madvise_collapse(vma, prev, start, end); + } + /* The anon name is left unchanged: hold a reference to the current one across the update. */ + anon_name = anon_vma_name(vma); + anon_vma_name_get(anon_name); + error = madvise_update_vma(vma, prev, start, end, new_flags, + anon_name); + anon_vma_name_put(anon_name); + +out: + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable, so -ENOMEM from the + * helpers above is reported as -EAGAIN. + */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +#ifdef CONFIG_MEMORY_FAILURE +/* + * Error injection support for memory error handling. + * + * Requires CAP_SYS_ADMIN (-EPERM otherwise). Walks [start, end), + * getting one page at a time with get_user_pages_fast() and passing + * its pfn to soft_offline_page() for MADV_SOFT_OFFLINE or to + * memory_failure() for any other behavior. Each step advances by the + * size of the compound page that contains the page just handled. + * -EOPNOTSUPP from memory_failure() is treated as success. + */ +static int madvise_inject_error(int behavior, + unsigned long start, unsigned long end) +{ + unsigned long size; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + + for (; start < end; start += size) { + unsigned long pfn; + struct page *page; + int ret; + + ret = get_user_pages_fast(start, 1, 0, &page); + if (ret != 1) + return ret; + pfn = page_to_pfn(page); + + /* + * When soft offlining hugepages, after migrating the page + * we dissolve it, therefore in the second loop "page" will + * no longer be a compound page.
+ */ + size = page_size(compound_head(page)); + + if (behavior == MADV_SOFT_OFFLINE) { + pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", + pfn, start); + ret = soft_offline_page(pfn, MF_COUNT_INCREASED); + } else { + pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", + pfn, start); + ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED); + if (ret == -EOPNOTSUPP) + ret = 0; + } + + if (ret) + return ret; + } + + return 0; +} +#endif +/* Return true if @behavior is an advice value this kernel configuration accepts in do_madvise(). */ +static bool +madvise_behavior_valid(int behavior) +{ + switch (behavior) { + case MADV_DOFORK: + case MADV_DONTFORK: + case MADV_NORMAL: + case MADV_SEQUENTIAL: + case MADV_RANDOM: + case MADV_REMOVE: + case MADV_WILLNEED: + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_FREE: + case MADV_COLD: + case MADV_PAGEOUT: + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: +#ifdef CONFIG_KSM + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + case MADV_COLLAPSE: +#endif + case MADV_DONTDUMP: + case MADV_DODUMP: + case MADV_WIPEONFORK: + case MADV_KEEPONFORK: +#ifdef CONFIG_MEMORY_FAILURE + case MADV_SOFT_OFFLINE: + case MADV_HWPOISON: +#endif + return true; + + default: + return false; + } +} +/* Return true if @behavior is one of the hints accepted by process_madvise(). */ +static bool process_madvise_behavior_valid(int behavior) +{ + switch (behavior) { + case MADV_COLD: + case MADV_PAGEOUT: + case MADV_WILLNEED: + case MADV_COLLAPSE: + return true; + default: + return false; + } +} + +/* + * Walk the vmas in range [start,end), and call the visit function on each one. + * The visit function will get start and end parameters that cover the overlap + * between the current vma and the original range. Any unmapped regions in the + * original range will result in this function returning -ENOMEM while still + * calling the visit function on all of the existing vmas in the range. + * Must be called with the mmap_lock held for reading or writing.
+ */ +static +int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long arg, + int (*visit)(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long arg)) +{ + struct vm_area_struct *vma; + struct vm_area_struct *prev; + unsigned long tmp; + int unmapped_error = 0; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + * This differs from how mlock etc. handle such ranges. + */ + vma = find_vma_prev(mm, start, &prev); + if (vma && start > vma->vm_start) + prev = vma; + + for (;;) { + int error; + + /* Still start < end. */ + if (!vma) + return -ENOMEM; + + /* Here start < (end|vma->vm_end). */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + if (start >= end) + break; + } + + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; + + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ + error = visit(vma, &prev, start, tmp, arg); + if (error) + return error; + start = tmp; + if (prev && start < prev->vm_end) + start = prev->vm_end; + if (start >= end) + break; + if (prev) + vma = find_vma(mm, prev->vm_end); + else /* the visit function set prev to NULL because mmap_lock was dropped (e.g. madvise_remove) */ + vma = find_vma(mm, start); + } + + return unmapped_error; +} +/* madvise_walk_vmas() visitor: @anon_name carries a struct anon_vma_name pointer cast to unsigned long. */ +#ifdef CONFIG_ANON_VMA_NAME +static int madvise_vma_anon_name(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long anon_name) +{ + int error; + + /* Only anonymous mappings can be named */ + if (vma->vm_file) + return -EBADF; + + error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, + (struct anon_vma_name *)anon_name); + + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable.
+ */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} +/* + * Set @anon_name on the vmas in [start, start + len_in), with len_in + * rounded up to a multiple of the page size. @start must be page + * aligned. An empty range succeeds without walking any vma. + */ +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, struct anon_vma_name *anon_name) +{ + unsigned long end; + unsigned long len; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check whether rounding up wrapped a huge len_in (a small negative value) to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, + madvise_vma_anon_name); +} +#endif /* CONFIG_ANON_VMA_NAME */ +/* + * The madvise(2) system call. + * + * Applications can use madvise() to advise the kernel how it should + * handle paging I/O in this VM area. The idea is to help the kernel + * use appropriate read-ahead and caching techniques. The information + * provided is advisory only, and can be safely disregarded by the + * kernel without affecting the correct operation of the application. + * + * behavior values: + * MADV_NORMAL - the default behavior is to read clusters. This + * results in some read-ahead and read-behind. + * MADV_RANDOM - the system should read the minimum amount of data + * on any access, since it is unlikely that the appli- + * cation will need more than what it asks for. + * MADV_SEQUENTIAL - pages in the given range will probably be accessed + * once, so they can be aggressively read ahead, and + * can be freed soon after they are accessed. + * MADV_WILLNEED - the application is notifying the system to read + * some pages ahead. + * MADV_DONTNEED - the application is finished with the given range, + * so the kernel can free resources associated with it. + * MADV_FREE - the application marks pages in the given range as lazy free, + * where actual purges are postponed until memory pressure happens.
+ * MADV_REMOVE - the application wants to free up the given range of + * pages and associated backing store. + * MADV_DONTFORK - omit this area from child's address space when forking: + * typically, to avoid COWing pages pinned by get_user_pages(). + * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_WIPEONFORK - present the child process with zero-filled memory in this + * range after a fork. + * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK. + * MADV_HWPOISON - trigger memory error handler as if the given memory range + * were corrupted by unrecoverable hardware memory failure. + * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. + * MADV_MERGEABLE - the application recommends that KSM try to merge pages in + * this area with pages of identical content from other such areas. + * MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others. + * MADV_HUGEPAGE - the application wants to back the given range by transparent + * huge pages in the future. Existing pages might be coalesced and + * new pages might be allocated as THP. + * MADV_NOHUGEPAGE - mark the given range as not worth being backed by + * transparent huge pages so the existing pages will not be + * coalesced into THP and new pages will not be allocated as THP. + * MADV_COLLAPSE - synchronously coalesce pages into new THP. + * MADV_DONTDUMP - the application wants to prevent pages in the given range + * from being included in its core dump. + * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. + * MADV_COLD - the application is not expected to use this memory soon, + * deactivate pages in this range so that they can be reclaimed + * easily if memory pressure happens. + * MADV_PAGEOUT - the application is not expected to use this memory soon, + * page out the pages in this range immediately.
+ * MADV_POPULATE_READ - populate (prefault) page tables readable by + * triggering read faults if required + * MADV_POPULATE_WRITE - populate (prefault) page tables writable by + * triggering write faults if required + * + * return values: + * zero - success + * -EINVAL - start + len < 0, start is not page-aligned, + * "behavior" is not a valid value, or application + * is attempting to release locked or shared pages, + * or the specified address range includes file, Huge TLB, + * MAP_SHARED or VM_PFNMAP range. + * -ENOMEM - addresses in the specified range are not currently + * mapped, or are outside the AS of the process. + * -EIO - an I/O error occurred while paging in data. + * -EBADF - map exists, but area maps something that isn't a file. + * -EAGAIN - a kernel resource was temporarily unavailable. + * -EACCES - MADV_REMOVE on a mapping that is not both shared and writable. + * -EPERM - MADV_HWPOISON or MADV_SOFT_OFFLINE without CAP_SYS_ADMIN. + * -EINTR - mmap_write_lock_killable() failed, or a fatal signal was + * pending during MADV_COLD or MADV_PAGEOUT. + */ +int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) +{ + unsigned long end; + int error; + int write; + size_t len; + struct blk_plug plug; + + start = untagged_addr(start); + + if (!madvise_behavior_valid(behavior)) + return -EINVAL; + + if (!PAGE_ALIGNED(start)) + return -EINVAL; + len = PAGE_ALIGN(len_in); + + /* Check whether rounding up wrapped a huge len_in (a small negative value) to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + /* Error injection uses the unrounded length and returns before any mmap_lock is taken. */ +#ifdef CONFIG_MEMORY_FAILURE + if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) + return madvise_inject_error(behavior, start, start + len_in); +#endif + /* Take the mmap_lock for writing or for reading, as madvise_need_mmap_write() decides. */ + write = madvise_need_mmap_write(behavior); + if (write) { + if (mmap_write_lock_killable(mm)) + return -EINTR; + } else { + mmap_read_lock(mm); + } + + blk_start_plug(&plug); + error = madvise_walk_vmas(mm, start, end, behavior, + madvise_vma_behavior); + blk_finish_plug(&plug); + if (write) + mmap_write_unlock(mm); + else + mmap_read_unlock(mm); + + return error; +} + +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+{ + return do_madvise(current->mm, start, len_in, behavior); +} +/* + * process_madvise(2): apply @behavior to every range described by @vec in + * the address space of the task referred to by @pidfd. @flags must be + * 0 and @behavior must pass process_madvise_behavior_valid(). Returns + * the number of bytes advised if at least one range was processed, + * otherwise the error code. + */ +SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, + size_t, vlen, int, behavior, unsigned int, flags) +{ + ssize_t ret; + struct iovec iovstack[UIO_FASTIOV], iovec; + struct iovec *iov = iovstack; + struct iov_iter iter; + struct task_struct *task; + struct mm_struct *mm; + size_t total_len; + unsigned int f_flags; + + if (flags != 0) { + ret = -EINVAL; + goto out; + } + + ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret < 0) + goto out; + + task = pidfd_get_task(pidfd, &f_flags); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto free_iov; + } + + if (!process_madvise_behavior_valid(behavior)) { + ret = -EINVAL; + goto release_task; + } + + /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ + mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); + if (IS_ERR_OR_NULL(mm)) { + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + goto release_task; + } + + /* + * Require CAP_SYS_NICE for influencing process performance. Note that + * only non-destructive hints are currently supported. + */ + if (!capable(CAP_SYS_NICE)) { + ret = -EPERM; + goto release_mm; + } + + total_len = iov_iter_count(&iter); + /* Advise one iovec at a time and stop at the first failure. */ + while (iov_iter_count(&iter)) { + iovec = iov_iter_iovec(&iter); + ret = do_madvise(mm, (unsigned long)iovec.iov_base, + iovec.iov_len, behavior); + if (ret < 0) + break; + iov_iter_advance(&iter, iovec.iov_len); + } + /* Report the bytes consumed from the iterator if there are any, otherwise the last return value. */ + ret = (total_len - iov_iter_count(&iter)) ?
: ret; + +release_mm: + mmput(mm); +release_task: + put_task_struct(task); +free_iov: + kfree(iov); +out: + return ret; +} diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c new file mode 100644 index 000000000..1b0ab8fcf --- /dev/null +++ b/mm/mapping_dirty_helpers.c @@ -0,0 +1,354 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +/** + * struct wp_walk - Private struct for pagetable walk callbacks + * @range: Range for mmu notifiers + * @tlbflush_start: Address of first modified pte + * @tlbflush_end: Address of last modified pte + 1 + * @total: Total number of modified ptes + */ +struct wp_walk { + struct mmu_notifier_range range; + unsigned long tlbflush_start; + unsigned long tlbflush_end; + unsigned long total; +}; + +/** + * wp_pte - Write-protect a pte + * @pte: Pointer to the pte + * @addr: The start of protecting virtual address + * @end: The end of protecting virtual address + * @walk: pagetable walk callback argument + * + * The function write-protects a pte and records the range in + * virtual address space of touched ptes for efficient range TLB flushes. + */ +static int wp_pte(pte_t *pte, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct wp_walk *wpwalk = walk->private; + pte_t ptent = *pte; + + if (pte_write(ptent)) { + pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte); + + ptent = pte_wrprotect(old_pte); + ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent); + wpwalk->total++; + wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr); + wpwalk->tlbflush_end = max(wpwalk->tlbflush_end, + addr + PAGE_SIZE); + } + + return 0; +} + +/** + * struct clean_walk - Private struct for the clean_record_pte function. + * @base: struct wp_walk we derive from + * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap + * @bitmap: Bitmap with one bit for each page offset in the address_space range + * covered. 
+ * @start: Address_space page offset of first modified pte relative + * to @bitmap_pgoff + * @end: Address_space page offset of last modified pte relative + * to @bitmap_pgoff + */ +struct clean_walk { + struct wp_walk base; + pgoff_t bitmap_pgoff; + unsigned long *bitmap; + pgoff_t start; + pgoff_t end; +}; + +#define to_clean_walk(_wpwalk) container_of(_wpwalk, struct clean_walk, base) + +/** + * clean_record_pte - Clean a pte and record its address space offset in a + * bitmap + * @pte: Pointer to the pte + * @addr: The start of virtual address to be clean + * @end: The end of virtual address to be clean + * @walk: pagetable walk callback argument + * + * The function cleans a pte and records the range in + * virtual address space of touched ptes for efficient TLB flushes. + * It also records dirty ptes in a bitmap representing page offsets + * in the address_space, as well as the first and last of the bits + * touched. + */ +static int clean_record_pte(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct wp_walk *wpwalk = walk->private; + struct clean_walk *cwalk = to_clean_walk(wpwalk); + pte_t ptent = *pte; + + if (pte_dirty(ptent)) { + pgoff_t pgoff = ((addr - walk->vma->vm_start) >> PAGE_SHIFT) + + walk->vma->vm_pgoff - cwalk->bitmap_pgoff; + pte_t old_pte = ptep_modify_prot_start(walk->vma, addr, pte); + + ptent = pte_mkclean(old_pte); + ptep_modify_prot_commit(walk->vma, addr, pte, old_pte, ptent); + + wpwalk->total++; + wpwalk->tlbflush_start = min(wpwalk->tlbflush_start, addr); + wpwalk->tlbflush_end = max(wpwalk->tlbflush_end, + addr + PAGE_SIZE); + + __set_bit(pgoff, cwalk->bitmap); + cwalk->start = min(cwalk->start, pgoff); + cwalk->end = max(cwalk->end, pgoff + 1); + } + + return 0; +} + +/* + * wp_clean_pmd_entry - The pagewalk pmd callback. + * + * Dirty-tracking should take place on the PTE level, so + * WARN() if encountering a dirty huge pmd. 
+ * Furthermore, never split huge pmds, since that currently + * causes dirty info loss. The pagefault handler should do + * that if needed. + */ +static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pmd_t pmdval = pmd_read_atomic(pmd); + + if (!pmd_trans_unstable(&pmdval)) + return 0; + + if (pmd_none(pmdval)) { + walk->action = ACTION_AGAIN; + return 0; + } + + /* Huge pmd, present or migrated */ + walk->action = ACTION_CONTINUE; + if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) + WARN_ON(pmd_write(pmdval) || pmd_dirty(pmdval)); + + return 0; +} + +/* + * wp_clean_pud_entry - The pagewalk pud callback. + * + * Dirty-tracking should take place on the PTE level, so + * WARN() if encountering a dirty huge pud. + * Furthermore, never split huge puds, since that currently + * causes dirty info loss. The pagefault handler should do + * that if needed. + */ +static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pud_t pudval = READ_ONCE(*pud); + + if (!pud_trans_unstable(&pudval)) + return 0; + + if (pud_none(pudval)) { + walk->action = ACTION_AGAIN; + return 0; + } + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + /* Huge pud */ + walk->action = ACTION_CONTINUE; + if (pud_trans_huge(pudval) || pud_devmap(pudval)) + WARN_ON(pud_write(pudval) || pud_dirty(pudval)); +#endif + + return 0; +} + +/* + * wp_clean_pre_vma - The pagewalk pre_vma callback. + * + * The pre_vma callback performs the cache flush, stages the tlb flush + * and calls the necessary mmu notifiers. 
+ */ +static int wp_clean_pre_vma(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct wp_walk *wpwalk = walk->private; + + wpwalk->tlbflush_start = end; + wpwalk->tlbflush_end = start; + + mmu_notifier_range_init(&wpwalk->range, MMU_NOTIFY_PROTECTION_PAGE, 0, + walk->vma, walk->mm, start, end); + mmu_notifier_invalidate_range_start(&wpwalk->range); + flush_cache_range(walk->vma, start, end); + + /* + * We're not using tlb_gather_mmu() since typically + * only a small subrange of PTEs are affected, whereas + * tlb_gather_mmu() records the full range. + */ + inc_tlb_flush_pending(walk->mm); + + return 0; +} + +/* + * wp_clean_post_vma - The pagewalk post_vma callback. + * + * The post_vma callback performs the tlb flush and calls necessary mmu + * notifiers. + */ +static void wp_clean_post_vma(struct mm_walk *walk) +{ + struct wp_walk *wpwalk = walk->private; + + if (mm_tlb_flush_nested(walk->mm)) + flush_tlb_range(walk->vma, wpwalk->range.start, + wpwalk->range.end); + else if (wpwalk->tlbflush_end > wpwalk->tlbflush_start) + flush_tlb_range(walk->vma, wpwalk->tlbflush_start, + wpwalk->tlbflush_end); + + mmu_notifier_invalidate_range_end(&wpwalk->range); + dec_tlb_flush_pending(walk->mm); +} + +/* + * wp_clean_test_walk - The pagewalk test_walk callback. + * + * Won't perform dirty-tracking on COW, read-only or HUGETLB vmas. 
+ */ +static int wp_clean_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags); + + /* Skip non-applicable VMAs */ + if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) != + (VM_SHARED | VM_MAYWRITE)) + return 1; + + return 0; +} + +static const struct mm_walk_ops clean_walk_ops = { + .pte_entry = clean_record_pte, + .pmd_entry = wp_clean_pmd_entry, + .pud_entry = wp_clean_pud_entry, + .test_walk = wp_clean_test_walk, + .pre_vma = wp_clean_pre_vma, + .post_vma = wp_clean_post_vma +}; + +static const struct mm_walk_ops wp_walk_ops = { + .pte_entry = wp_pte, + .pmd_entry = wp_clean_pmd_entry, + .pud_entry = wp_clean_pud_entry, + .test_walk = wp_clean_test_walk, + .pre_vma = wp_clean_pre_vma, + .post_vma = wp_clean_post_vma +}; + +/** + * wp_shared_mapping_range - Write-protect all ptes in an address space range + * @mapping: The address_space we want to write protect + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * + * Note: This function currently skips transhuge page-table entries, since + * it's intended for dirty-tracking on the PTE level. It will warn on + * encountering transhuge write-enabled entries, though, and can easily be + * extended to handle them as well. + * + * Return: The number of ptes actually write-protected. Note that + * already write-protected ptes are not counted. 
+ */ +unsigned long wp_shared_mapping_range(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr) +{ + struct wp_walk wpwalk = { .total = 0 }; + + i_mmap_lock_read(mapping); + WARN_ON(walk_page_mapping(mapping, first_index, nr, &wp_walk_ops, + &wpwalk)); + i_mmap_unlock_read(mapping); + + return wpwalk.total; +} +EXPORT_SYMBOL_GPL(wp_shared_mapping_range); + +/** + * clean_record_shared_mapping_range - Clean and record all ptes in an + * address space range + * @mapping: The address_space we want to clean + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * @bitmap_pgoff: The page offset of the first bit in @bitmap + * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to + * cover the whole range @first_index..@first_index + @nr. + * @start: Pointer to the number of the first set bit in @bitmap. The value + * is modified as new bits are set by the function. + * @end: Pointer to one past the number of the last set bit in @bitmap, or a + * value <= *@start if none set. Modified as new bits are set by the function. + * + * Note: When this function returns there is no guarantee that a CPU has + * not already dirtied new ptes. However it will not clean any ptes not + * reported in the bitmap. The guarantees are as follows: + * a) All ptes dirty when the function starts executing will end up recorded + * in the bitmap. + * b) All ptes dirtied after that will either remain dirty, be recorded in the + * bitmap or both. + * + * If a caller needs to make sure all dirty ptes are picked up and none + * additional are added, it first needs to write-protect the address-space + * range and make sure new writers are blocked in page_mkwrite() or + * pfn_mkwrite(). And then after a TLB flush following the write-protection + * pick up all dirty bits. + * + * This function currently skips transhuge page-table entries, since + * it's intended for dirty-tracking on the PTE level. 
It will warn on + * encountering transhuge dirty entries, though, and can easily be extended + * to handle them as well. + * + * Return: The number of dirty ptes actually cleaned. + */ +unsigned long clean_record_shared_mapping_range(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr, + pgoff_t bitmap_pgoff, + unsigned long *bitmap, + pgoff_t *start, + pgoff_t *end) +{ + bool none_set = (*start >= *end); + struct clean_walk cwalk = { + .base = { .total = 0 }, + .bitmap_pgoff = bitmap_pgoff, + .bitmap = bitmap, + .start = none_set ? nr : *start, + .end = none_set ? 0 : *end, + }; + + i_mmap_lock_read(mapping); + WARN_ON(walk_page_mapping(mapping, first_index, nr, &clean_walk_ops, + &cwalk.base)); + i_mmap_unlock_read(mapping); + + *start = cwalk.start; + *end = cwalk.end; + + return cwalk.base.total; +} +EXPORT_SYMBOL_GPL(clean_record_shared_mapping_range); diff --git a/mm/memblock.c b/mm/memblock.c new file mode 100644 index 000000000..511d4783d --- /dev/null +++ b/mm/memblock.c @@ -0,0 +1,2175 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Procedures for maintaining information about logical memory blocks. + * + * Peter Bergner, IBM Corp. June 2001. + * Copyright (C) 2001 Peter Bergner. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "internal.h" + +#define INIT_MEMBLOCK_REGIONS 128 +#define INIT_PHYSMEM_REGIONS 4 + +#ifndef INIT_MEMBLOCK_RESERVED_REGIONS +# define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS +#endif + +#ifndef INIT_MEMBLOCK_MEMORY_REGIONS +#define INIT_MEMBLOCK_MEMORY_REGIONS INIT_MEMBLOCK_REGIONS +#endif + +/** + * DOC: memblock overview + * + * Memblock is a method of managing memory regions during the early + * boot period when the usual kernel memory allocators are not up and + * running. + * + * Memblock views the system memory as collections of contiguous + * regions. 
There are several types of these collections: + * + * * ``memory`` - describes the physical memory available to the + * kernel; this may differ from the actual physical memory installed + * in the system, for instance when the memory is restricted with + * ``mem=`` command line parameter + * * ``reserved`` - describes the regions that were allocated + * * ``physmem`` - describes the actual physical memory available during + * boot regardless of the possible restrictions and memory hot(un)plug; + * the ``physmem`` type is only available on some architectures. + * + * Each region is represented by struct memblock_region that + * defines the region extents, its attributes and NUMA node id on NUMA + * systems. Every memory type is described by the struct memblock_type + * which contains an array of memory regions along with + * the allocator metadata. The "memory" and "reserved" types are nicely + * wrapped with struct memblock. This structure is statically + * initialized at build time. The region arrays are initially sized to + * %INIT_MEMBLOCK_MEMORY_REGIONS for "memory" and + * %INIT_MEMBLOCK_RESERVED_REGIONS for "reserved". The region array + * for "physmem" is initially sized to %INIT_PHYSMEM_REGIONS. + * The memblock_allow_resize() enables automatic resizing of the region + * arrays during addition of new regions. This feature should be used + * with care so that memory allocated for the region array will not + * overlap with areas that should be reserved, for example initrd. + * + * The early architecture setup should tell memblock what the physical + * memory layout is by using memblock_add() or memblock_add_node() + * functions. The first function does not assign the region to a NUMA + * node and it is appropriate for UMA systems. Yet, it is possible to + * use it on NUMA systems as well and assign the region to a NUMA node + * later in the setup process using memblock_set_node(). The + * memblock_add_node() performs such an assignment directly. 
+ * + * Once memblock is setup the memory can be allocated using one of the + * API variants: + * + * * memblock_phys_alloc*() - these functions return the **physical** + * address of the allocated memory + * * memblock_alloc*() - these functions return the **virtual** address + * of the allocated memory. + * + * Note, that both API variants use implicit assumptions about allowed + * memory ranges and the fallback methods. Consult the documentation + * of memblock_alloc_internal() and memblock_alloc_range_nid() + * functions for more elaborate description. + * + * As the system boot progresses, the architecture specific mem_init() + * function frees all the memory to the buddy page allocator. + * + * Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the + * memblock data structures (except "physmem") will be discarded after the + * system initialization completes. + */ + +#ifndef CONFIG_NUMA +struct pglist_data __refdata contig_page_data; +EXPORT_SYMBOL(contig_page_data); +#endif + +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; +unsigned long long max_possible_pfn; + +static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock; +static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP +static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS]; +#endif + +struct memblock memblock __initdata_memblock = { + .memory.regions = memblock_memory_init_regions, + .memory.cnt = 1, /* empty dummy entry */ + .memory.max = INIT_MEMBLOCK_MEMORY_REGIONS, + .memory.name = "memory", + + .reserved.regions = memblock_reserved_init_regions, + .reserved.cnt = 1, /* empty dummy entry */ + .reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS, + .reserved.name = "reserved", + + .bottom_up = false, + .current_limit = MEMBLOCK_ALLOC_ANYWHERE, +}; + +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP 
+struct memblock_type physmem = { + .regions = memblock_physmem_init_regions, + .cnt = 1, /* empty dummy entry */ + .max = INIT_PHYSMEM_REGIONS, + .name = "physmem", +}; +#endif + +/* + * keep a pointer to &memblock.memory in the text section to use it in + * __next_mem_range() and its helpers. + * For architectures that do not keep memblock data after init, this + * pointer will be reset to NULL at memblock_discard() + */ +static __refdata struct memblock_type *memblock_memory = &memblock.memory; + +#define for_each_memblock_type(i, memblock_type, rgn) \ + for (i = 0, rgn = &memblock_type->regions[0]; \ + i < memblock_type->cnt; \ + i++, rgn = &memblock_type->regions[i]) + +#define memblock_dbg(fmt, ...) \ + do { \ + if (memblock_debug) \ + pr_info(fmt, ##__VA_ARGS__); \ + } while (0) + +static int memblock_debug __initdata_memblock; +static bool system_has_some_mirror __initdata_memblock = false; +static int memblock_can_resize __initdata_memblock; +static int memblock_memory_in_slab __initdata_memblock = 0; +static int memblock_reserved_in_slab __initdata_memblock = 0; + +static enum memblock_flags __init_memblock choose_memblock_flags(void) +{ + return system_has_some_mirror ? 
MEMBLOCK_MIRROR : MEMBLOCK_NONE; +} + +/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ +static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) +{ + return *size = min(*size, PHYS_ADDR_MAX - base); +} + +/* + * Address comparison utilities + */ +static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, + phys_addr_t base2, phys_addr_t size2) +{ + return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); +} + +bool __init_memblock memblock_overlaps_region(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) +{ + unsigned long i; + + memblock_cap_size(base, &size); + + for (i = 0; i < type->cnt; i++) + if (memblock_addrs_overlap(base, size, type->regions[i].base, + type->regions[i].size)) + break; + return i < type->cnt; +} + +/** + * __memblock_find_range_bottom_up - find free area utility in bottom-up + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE + * @size: size of free area to find + * @align: alignment of free area to find + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @flags: pick from blocks based on memory attributes + * + * Utility called from memblock_find_in_range_node(), find free area bottom-up. + * + * Return: + * Found address on success, 0 on failure. 
+ */ +static phys_addr_t __init_memblock +__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align, int nid, + enum memblock_flags flags) +{ + phys_addr_t this_start, this_end, cand; + u64 i; + + for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) { + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); + + cand = round_up(this_start, align); + if (cand < this_end && this_end - cand >= size) + return cand; + } + + return 0; +} + +/** + * __memblock_find_range_top_down - find free area utility, in top-down + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE + * @size: size of free area to find + * @align: alignment of free area to find + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @flags: pick from blocks based on memory attributes + * + * Utility called from memblock_find_in_range_node(), find free area top-down. + * + * Return: + * Found address on success, 0 on failure. 
+ */ +static phys_addr_t __init_memblock +__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align, int nid, + enum memblock_flags flags) +{ + phys_addr_t this_start, this_end, cand; + u64 i; + + for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end, + NULL) { + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); + + if (this_end < size) + continue; + + cand = round_down(this_end - size, align); + if (cand >= this_start) + return cand; + } + + return 0; +} + +/** + * memblock_find_in_range_node - find free area in given range and node + * @size: size of free area to find + * @align: alignment of free area to find + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @flags: pick from blocks based on memory attributes + * + * Find @size free area aligned to @align in the specified range and node. + * + * Return: + * Found address on success, 0 on failure. 
+ */ +static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid, + enum memblock_flags flags) +{ + /* pump up @end */ + if (end == MEMBLOCK_ALLOC_ACCESSIBLE || + end == MEMBLOCK_ALLOC_NOLEAKTRACE) + end = memblock.current_limit; + + /* avoid allocating the first page */ + start = max_t(phys_addr_t, start, PAGE_SIZE); + end = max(start, end); + + if (memblock_bottom_up()) + return __memblock_find_range_bottom_up(start, end, size, align, + nid, flags); + else + return __memblock_find_range_top_down(start, end, size, align, + nid, flags); +} + +/** + * memblock_find_in_range - find free area in given range + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or + * %MEMBLOCK_ALLOC_ACCESSIBLE + * @size: size of free area to find + * @align: alignment of free area to find + * + * Find @size free area aligned to @align in the specified range. + * + * Return: + * Found address on success, 0 on failure. 
+ */ +static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, + phys_addr_t end, phys_addr_t size, + phys_addr_t align) +{ + phys_addr_t ret; + enum memblock_flags flags = choose_memblock_flags(); + +again: + ret = memblock_find_in_range_node(size, align, start, end, + NUMA_NO_NODE, flags); + + if (!ret && (flags & MEMBLOCK_MIRROR)) { + pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n", + &size); + flags &= ~MEMBLOCK_MIRROR; + goto again; + } + + return ret; +} + +static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) +{ + type->total_size -= type->regions[r].size; + memmove(&type->regions[r], &type->regions[r + 1], + (type->cnt - (r + 1)) * sizeof(type->regions[r])); + type->cnt--; + + /* Special case for empty arrays */ + if (type->cnt == 0) { + WARN_ON(type->total_size != 0); + type->cnt = 1; + type->regions[0].base = 0; + type->regions[0].size = 0; + type->regions[0].flags = 0; + memblock_set_region_node(&type->regions[0], MAX_NUMNODES); + } +} + +#ifndef CONFIG_ARCH_KEEP_MEMBLOCK +/** + * memblock_discard - discard memory and reserved arrays if they were allocated + */ +void __init memblock_discard(void) +{ + phys_addr_t addr, size; + + if (memblock.reserved.regions != memblock_reserved_init_regions) { + addr = __pa(memblock.reserved.regions); + size = PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); + if (memblock_reserved_in_slab) + kfree(memblock.reserved.regions); + else + memblock_free_late(addr, size); + } + + if (memblock.memory.regions != memblock_memory_init_regions) { + addr = __pa(memblock.memory.regions); + size = PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.memory.max); + if (memblock_memory_in_slab) + kfree(memblock.memory.regions); + else + memblock_free_late(addr, size); + } + + memblock_memory = NULL; +} +#endif + +/** + * memblock_double_array - double the size of the memblock regions array + * @type: memblock type of the 
regions array being doubled + * @new_area_start: starting address of memory range to avoid overlap with + * @new_area_size: size of memory range to avoid overlap with + * + * Double the size of the @type regions array. If memblock is being used to + * allocate memory for a new reserved regions array and there is a previously + * allocated memory range [@new_area_start, @new_area_start + @new_area_size] + * waiting to be reserved, ensure the memory used by the new array does + * not overlap. + * + * Return: + * 0 on success, -1 on failure. + */ +static int __init_memblock memblock_double_array(struct memblock_type *type, + phys_addr_t new_area_start, + phys_addr_t new_area_size) +{ + struct memblock_region *new_array, *old_array; + phys_addr_t old_alloc_size, new_alloc_size; + phys_addr_t old_size, new_size, addr, new_end; + int use_slab = slab_is_available(); + int *in_slab; + + /* We don't allow resizing until we know about the reserved regions + * of memory that aren't suitable for allocation + */ + if (!memblock_can_resize) + return -1; + + /* Calculate new doubled size */ + old_size = type->max * sizeof(struct memblock_region); + new_size = old_size << 1; + /* + * We need to allocate the new array aligned to PAGE_SIZE, + * so that it can be freed completely later. + */ + old_alloc_size = PAGE_ALIGN(old_size); + new_alloc_size = PAGE_ALIGN(new_size); + + /* Retrieve the slab flag */ + if (type == &memblock.memory) + in_slab = &memblock_memory_in_slab; + else + in_slab = &memblock_reserved_in_slab; + + /* Try to find some space for it */ + if (use_slab) { + new_array = kmalloc(new_size, GFP_KERNEL); + addr = new_array ? 
__pa(new_array) : 0; + } else { + /* only exclude range when trying to double reserved.regions */ + if (type != &memblock.reserved) + new_area_start = new_area_size = 0; + + addr = memblock_find_in_range(new_area_start + new_area_size, + memblock.current_limit, + new_alloc_size, PAGE_SIZE); + if (!addr && new_area_size) + addr = memblock_find_in_range(0, + min(new_area_start, memblock.current_limit), + new_alloc_size, PAGE_SIZE); + + new_array = addr ? __va(addr) : NULL; + } + if (!addr) { + pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", + type->name, type->max, type->max * 2); + return -1; + } + + new_end = addr + new_size - 1; + memblock_dbg("memblock: %s is doubled to %ld at [%pa-%pa]", + type->name, type->max * 2, &addr, &new_end); + + /* + * Found space, we now need to move the array over before we add the + * reserved region since it may be our reserved array itself that is + * full. + */ + memcpy(new_array, type->regions, old_size); + memset(new_array + type->max, 0, old_size); + old_array = type->regions; + type->regions = new_array; + type->max <<= 1; + + /* Free old array. We needn't free it if the array is the static one */ + if (*in_slab) + kfree(old_array); + else if (old_array != memblock_memory_init_regions && + old_array != memblock_reserved_init_regions) + memblock_free(old_array, old_alloc_size); + + /* + * Reserve the new array if that comes from the memblock. Otherwise, we + * needn't do it + */ + if (!use_slab) + BUG_ON(memblock_reserve(addr, new_alloc_size)); + + /* Update slab flag */ + *in_slab = use_slab; + + return 0; +} + +/** + * memblock_merge_regions - merge neighboring compatible regions + * @type: memblock type to scan + * + * Scan @type and merge neighboring compatible regions. 
+ */ +static void __init_memblock memblock_merge_regions(struct memblock_type *type) +{ + int i = 0; + + /* cnt never goes below 1 */ + while (i < type->cnt - 1) { + struct memblock_region *this = &type->regions[i]; + struct memblock_region *next = &type->regions[i + 1]; + + if (this->base + this->size != next->base || + memblock_get_region_node(this) != + memblock_get_region_node(next) || + this->flags != next->flags) { + BUG_ON(this->base + this->size > next->base); + i++; + continue; + } + + this->size += next->size; + /* move forward from next + 1, index of which is i + 2 */ + memmove(next, next + 1, (type->cnt - (i + 2)) * sizeof(*next)); + type->cnt--; + } +} + +/** + * memblock_insert_region - insert new memblock region + * @type: memblock type to insert into + * @idx: index for the insertion point + * @base: base address of the new region + * @size: size of the new region + * @nid: node id of the new region + * @flags: flags of the new region + * + * Insert new memblock region [@base, @base + @size) into @type at @idx. + * @type must already have extra room to accommodate the new region. + */ +static void __init_memblock memblock_insert_region(struct memblock_type *type, + int idx, phys_addr_t base, + phys_addr_t size, + int nid, + enum memblock_flags flags) +{ + struct memblock_region *rgn = &type->regions[idx]; + + BUG_ON(type->cnt >= type->max); + memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); + rgn->base = base; + rgn->size = size; + rgn->flags = flags; + memblock_set_region_node(rgn, nid); + type->cnt++; + type->total_size += size; +} + +/** + * memblock_add_range - add new memblock region + * @type: memblock type to add new region into + * @base: base address of the new region + * @size: size of the new region + * @nid: nid of the new region + * @flags: flags of the new region + * + * Add new memblock region [@base, @base + @size) into @type. 
The new region + * is allowed to overlap with existing ones - overlaps don't affect already + * existing regions. @type is guaranteed to be minimal (all neighbouring + * compatible regions are merged) after the addition. + * + * Return: + * 0 on success, -errno on failure. + */ +static int __init_memblock memblock_add_range(struct memblock_type *type, + phys_addr_t base, phys_addr_t size, + int nid, enum memblock_flags flags) +{ + bool insert = false; + phys_addr_t obase = base; + phys_addr_t end = base + memblock_cap_size(base, &size); + int idx, nr_new; + struct memblock_region *rgn; + + if (!size) + return 0; + + /* special case for empty array */ + if (type->regions[0].size == 0) { + WARN_ON(type->cnt != 1 || type->total_size); + type->regions[0].base = base; + type->regions[0].size = size; + type->regions[0].flags = flags; + memblock_set_region_node(&type->regions[0], nid); + type->total_size = size; + return 0; + } + + /* + * The worst case is when new range overlaps all existing regions, + * then we'll need type->cnt + 1 empty regions in @type. So if + * type->cnt * 2 + 1 is less than type->max, we know + * that there is enough empty regions in @type, and we can insert + * regions directly. + */ + if (type->cnt * 2 + 1 < type->max) + insert = true; + +repeat: + /* + * The following is executed twice. Once with %false @insert and + * then with %true. The first counts the number of regions needed + * to accommodate the new area. The second actually inserts them. + */ + base = obase; + nr_new = 0; + + for_each_memblock_type(idx, type, rgn) { + phys_addr_t rbase = rgn->base; + phys_addr_t rend = rbase + rgn->size; + + if (rbase >= end) + break; + if (rend <= base) + continue; + /* + * @rgn overlaps. If it separates the lower part of new + * area, insert that portion. 
+ */ + if (rbase > base) { +#ifdef CONFIG_NUMA + WARN_ON(nid != memblock_get_region_node(rgn)); +#endif + WARN_ON(flags != rgn->flags); + nr_new++; + if (insert) + memblock_insert_region(type, idx++, base, + rbase - base, nid, + flags); + } + /* area below @rend is dealt with, forget about it */ + base = min(rend, end); + } + + /* insert the remaining portion */ + if (base < end) { + nr_new++; + if (insert) + memblock_insert_region(type, idx, base, end - base, + nid, flags); + } + + if (!nr_new) + return 0; + + /* + * If this was the first round, resize array and repeat for actual + * insertions; otherwise, merge and return. + */ + if (!insert) { + while (type->cnt + nr_new > type->max) + if (memblock_double_array(type, obase, size) < 0) + return -ENOMEM; + insert = true; + goto repeat; + } else { + memblock_merge_regions(type); + return 0; + } +} + +/** + * memblock_add_node - add new memblock region within a NUMA node + * @base: base address of the new region + * @size: size of the new region + * @nid: nid of the new region + * @flags: flags of the new region + * + * Add new memblock region [@base, @base + @size) to the "memory" + * type. See memblock_add_range() description for more details + * + * Return: + * 0 on success, -errno on failure. + */ +int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, + int nid, enum memblock_flags flags) +{ + phys_addr_t end = base + size - 1; + + memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__, + &base, &end, nid, flags, (void *)_RET_IP_); + + return memblock_add_range(&memblock.memory, base, size, nid, flags); +} + +/** + * memblock_add - add new memblock region + * @base: base address of the new region + * @size: size of the new region + * + * Add new memblock region [@base, @base + @size) to the "memory" + * type. See memblock_add_range() description for more details + * + * Return: + * 0 on success, -errno on failure. 
+ */ +int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) +{ + phys_addr_t end = base + size - 1; + + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, + &base, &end, (void *)_RET_IP_); + + return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); +} + +/** + * memblock_isolate_range - isolate given range into disjoint memblocks + * @type: memblock type to isolate range for + * @base: base of range to isolate + * @size: size of range to isolate + * @start_rgn: out parameter for the start of isolated region + * @end_rgn: out parameter for the end of isolated region + * + * Walk @type and ensure that regions don't cross the boundaries defined by + * [@base, @base + @size). Crossing regions are split at the boundaries, + * which may create at most two more regions. The index of the first + * region inside the range is returned in *@start_rgn and end in *@end_rgn. + * + * Return: + * 0 on success, -errno on failure. + */ +static int __init_memblock memblock_isolate_range(struct memblock_type *type, + phys_addr_t base, phys_addr_t size, + int *start_rgn, int *end_rgn) +{ + phys_addr_t end = base + memblock_cap_size(base, &size); + int idx; + struct memblock_region *rgn; + + *start_rgn = *end_rgn = 0; + + if (!size) + return 0; + + /* we'll create at most two more regions */ + while (type->cnt + 2 > type->max) + if (memblock_double_array(type, base, size) < 0) + return -ENOMEM; + + for_each_memblock_type(idx, type, rgn) { + phys_addr_t rbase = rgn->base; + phys_addr_t rend = rbase + rgn->size; + + if (rbase >= end) + break; + if (rend <= base) + continue; + + if (rbase < base) { + /* + * @rgn intersects from below. Split and continue + * to process the next region - the new top half. + */ + rgn->base = base; + rgn->size -= base - rbase; + type->total_size -= base - rbase; + memblock_insert_region(type, idx, rbase, base - rbase, + memblock_get_region_node(rgn), + rgn->flags); + } else if (rend > end) { + /* + * @rgn intersects from above. 
Split and redo the + * current region - the new bottom half. + */ + rgn->base = end; + rgn->size -= end - rbase; + type->total_size -= end - rbase; + memblock_insert_region(type, idx--, rbase, end - rbase, + memblock_get_region_node(rgn), + rgn->flags); + } else { + /* @rgn is fully contained, record it */ + if (!*end_rgn) + *start_rgn = idx; + *end_rgn = idx + 1; + } + } + + return 0; +} + +static int __init_memblock memblock_remove_range(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) +{ + int start_rgn, end_rgn; + int i, ret; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = end_rgn - 1; i >= start_rgn; i--) + memblock_remove_region(type, i); + return 0; +} + +int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) +{ + phys_addr_t end = base + size - 1; + + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, + &base, &end, (void *)_RET_IP_); + + return memblock_remove_range(&memblock.memory, base, size); +} + +/** + * memblock_free - free boot memory allocation + * @ptr: starting address of the boot memory allocation + * @size: size of the boot memory block in bytes + * + * Free boot memory block previously allocated by memblock_alloc_xx() API. + * The freeing memory will not be released to the buddy allocator. + */ +void __init_memblock memblock_free(void *ptr, size_t size) +{ + if (ptr) + memblock_phys_free(__pa(ptr), size); +} + +/** + * memblock_phys_free - free boot memory block + * @base: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * Free boot memory block previously allocated by memblock_alloc_xx() API. + * The freeing memory will not be released to the buddy allocator. 
+ */ +int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size) +{ + phys_addr_t end = base + size - 1; + + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, + &base, &end, (void *)_RET_IP_); + + kmemleak_free_part_phys(base, size); + return memblock_remove_range(&memblock.reserved, base, size); +} + +int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +{ + phys_addr_t end = base + size - 1; + + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, + &base, &end, (void *)_RET_IP_); + + return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); +} + +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP +int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) +{ + phys_addr_t end = base + size - 1; + + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, + &base, &end, (void *)_RET_IP_); + + return memblock_add_range(&physmem, base, size, MAX_NUMNODES, 0); +} +#endif + +/** + * memblock_setclr_flag - set or clear flag for a memory region + * @base: base address of the region + * @size: size of the region + * @set: set or clear the flag + * @flag: the flag to update + * + * This function isolates region [@base, @base + @size), and sets/clears flag + * + * Return: 0 on success, -errno on failure. + */ +static int __init_memblock memblock_setclr_flag(phys_addr_t base, + phys_addr_t size, int set, int flag) +{ + struct memblock_type *type = &memblock.memory; + int i, ret, start_rgn, end_rgn; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = start_rgn; i < end_rgn; i++) { + struct memblock_region *r = &type->regions[i]; + + if (set) + r->flags |= flag; + else + r->flags &= ~flag; + } + + memblock_merge_regions(type); + return 0; +} + +/** + * memblock_mark_hotplug - Mark hotpluggable memory with flag MEMBLOCK_HOTPLUG. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return: 0 on success, -errno on failure. 
+ */ +int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG); +} + +/** + * memblock_clear_hotplug - Clear flag MEMBLOCK_HOTPLUG for a specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return: 0 on success, -errno on failure. + */ +int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG); +} + +/** + * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return: 0 on success, -errno on failure. + */ +int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) +{ + if (!mirrored_kernelcore) + return 0; + + system_has_some_mirror = true; + + return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); +} + +/** + * memblock_mark_nomap - Mark a memory region with flag MEMBLOCK_NOMAP. + * @base: the base phys addr of the region + * @size: the size of the region + * + * The memory regions marked with %MEMBLOCK_NOMAP will not be added to the + * direct mapping of the physical memory. These regions will still be + * covered by the memory map. The struct page representing NOMAP memory + * frames in the memory map will be PageReserved() + * + * Note: if the memory being marked %MEMBLOCK_NOMAP was allocated from + * memblock, the caller must inform kmemleak to ignore that memory + * + * Return: 0 on success, -errno on failure. + */ +int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP); +} + +/** + * memblock_clear_nomap - Clear flag MEMBLOCK_NOMAP for a specified region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return: 0 on success, -errno on failure. 
+ */ +int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP); +} + +static bool should_skip_region(struct memblock_type *type, + struct memblock_region *m, + int nid, int flags) +{ + int m_nid = memblock_get_region_node(m); + + /* we never skip regions when iterating memblock.reserved or physmem */ + if (type != memblock_memory) + return false; + + /* only memory regions are associated with nodes, check it */ + if (nid != NUMA_NO_NODE && nid != m_nid) + return true; + + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m) && + !(flags & MEMBLOCK_HOTPLUG)) + return true; + + /* if we want mirror memory skip non-mirror memory regions */ + if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) + return true; + + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + return true; + + /* skip driver-managed memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m)) + return true; + + return false; +} + +/** + * __next_mem_range - next function for for_each_free_mem_range() etc. + * @idx: pointer to u64 loop variable + * @nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes + * @type_a: pointer to memblock_type from where the range is taken + * @type_b: pointer to memblock_type which excludes memory from being taken + * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @out_nid: ptr to int for nid of the range, can be %NULL + * + * Find the first area from *@idx which matches @nid, fill the out + * parameters, and update *@idx for the next iteration. 
The lower 32bit of + * *@idx contains index into type_a and the upper 32bit indexes the + * areas before each region in type_b. For example, if type_b regions + * look like the following, + * + * 0:[0-16), 1:[32-48), 2:[128-130) + * + * The upper 32bit indexes the following regions. + * + * 0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX) + * + * As both region arrays are sorted, the function advances the two indices + * in lockstep and returns each intersection. + */ +void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, + struct memblock_type *type_a, + struct memblock_type *type_b, phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) +{ + int idx_a = *idx & 0xffffffff; + int idx_b = *idx >> 32; + + if (WARN_ONCE(nid == MAX_NUMNODES, + "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + + for (; idx_a < type_a->cnt; idx_a++) { + struct memblock_region *m = &type_a->regions[idx_a]; + + phys_addr_t m_start = m->base; + phys_addr_t m_end = m->base + m->size; + int m_nid = memblock_get_region_node(m); + + if (should_skip_region(type_a, m, nid, flags)) + continue; + + if (!type_b) { + if (out_start) + *out_start = m_start; + if (out_end) + *out_end = m_end; + if (out_nid) + *out_nid = m_nid; + idx_a++; + *idx = (u32)idx_a | (u64)idx_b << 32; + return; + } + + /* scan areas before each reservation */ + for (; idx_b < type_b->cnt + 1; idx_b++) { + struct memblock_region *r; + phys_addr_t r_start; + phys_addr_t r_end; + + r = &type_b->regions[idx_b]; + r_start = idx_b ? r[-1].base + r[-1].size : 0; + r_end = idx_b < type_b->cnt ? 
+ r->base : PHYS_ADDR_MAX; + + /* + * if idx_b advanced past idx_a, + * break out to advance idx_a + */ + if (r_start >= m_end) + break; + /* if the two regions intersect, we're done */ + if (m_start < r_end) { + if (out_start) + *out_start = + max(m_start, r_start); + if (out_end) + *out_end = min(m_end, r_end); + if (out_nid) + *out_nid = m_nid; + /* + * The region which ends first is + * advanced for the next iteration. + */ + if (m_end <= r_end) + idx_a++; + else + idx_b++; + *idx = (u32)idx_a | (u64)idx_b << 32; + return; + } + } + } + + /* signal end of iteration */ + *idx = ULLONG_MAX; +} + +/** + * __next_mem_range_rev - generic next function for for_each_*_range_rev() + * + * @idx: pointer to u64 loop variable + * @nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes + * @type_a: pointer to memblock_type from where the range is taken + * @type_b: pointer to memblock_type which excludes memory from being taken + * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @out_nid: ptr to int for nid of the range, can be %NULL + * + * Finds the next range from type_a which is not marked as unsuitable + * in type_b. + * + * Reverse of __next_mem_range(). + */ +void __init_memblock __next_mem_range_rev(u64 *idx, int nid, + enum memblock_flags flags, + struct memblock_type *type_a, + struct memblock_type *type_b, + phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) +{ + int idx_a = *idx & 0xffffffff; + int idx_b = *idx >> 32; + + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. 
Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + + if (*idx == (u64)ULLONG_MAX) { + idx_a = type_a->cnt - 1; + if (type_b != NULL) + idx_b = type_b->cnt; + else + idx_b = 0; + } + + for (; idx_a >= 0; idx_a--) { + struct memblock_region *m = &type_a->regions[idx_a]; + + phys_addr_t m_start = m->base; + phys_addr_t m_end = m->base + m->size; + int m_nid = memblock_get_region_node(m); + + if (should_skip_region(type_a, m, nid, flags)) + continue; + + if (!type_b) { + if (out_start) + *out_start = m_start; + if (out_end) + *out_end = m_end; + if (out_nid) + *out_nid = m_nid; + idx_a--; + *idx = (u32)idx_a | (u64)idx_b << 32; + return; + } + + /* scan areas before each reservation */ + for (; idx_b >= 0; idx_b--) { + struct memblock_region *r; + phys_addr_t r_start; + phys_addr_t r_end; + + r = &type_b->regions[idx_b]; + r_start = idx_b ? r[-1].base + r[-1].size : 0; + r_end = idx_b < type_b->cnt ? + r->base : PHYS_ADDR_MAX; + /* + * if idx_b advanced past idx_a, + * break out to advance idx_a + */ + + if (r_end <= m_start) + break; + /* if the two regions intersect, we're done */ + if (m_end > r_start) { + if (out_start) + *out_start = max(m_start, r_start); + if (out_end) + *out_end = min(m_end, r_end); + if (out_nid) + *out_nid = m_nid; + if (m_start >= r_start) + idx_a--; + else + idx_b--; + *idx = (u32)idx_a | (u64)idx_b << 32; + return; + } + } + } + /* signal end of iteration */ + *idx = ULLONG_MAX; +} + +/* + * Common iterator interface used to define for_each_mem_pfn_range(). 
+ */ +void __init_memblock __next_mem_pfn_range(int *idx, int nid, + unsigned long *out_start_pfn, + unsigned long *out_end_pfn, int *out_nid) +{ + struct memblock_type *type = &memblock.memory; + struct memblock_region *r; + int r_nid; + + while (++*idx < type->cnt) { + r = &type->regions[*idx]; + r_nid = memblock_get_region_node(r); + + if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) + continue; + if (nid == MAX_NUMNODES || nid == r_nid) + break; + } + if (*idx >= type->cnt) { + *idx = -1; + return; + } + + if (out_start_pfn) + *out_start_pfn = PFN_UP(r->base); + if (out_end_pfn) + *out_end_pfn = PFN_DOWN(r->base + r->size); + if (out_nid) + *out_nid = r_nid; +} + +/** + * memblock_set_node - set node ID on memblock regions + * @base: base of area to set node ID for + * @size: size of area to set node ID for + * @type: memblock type to set node ID for + * @nid: node ID to set + * + * Set the nid of memblock @type regions in [@base, @base + @size) to @nid. + * Regions which cross the area boundaries are split as necessary. + * + * Return: + * 0 on success, -errno on failure. + */ +int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, + struct memblock_type *type, int nid) +{ +#ifdef CONFIG_NUMA + int start_rgn, end_rgn; + int i, ret; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = start_rgn; i < end_rgn; i++) + memblock_set_region_node(&type->regions[i], nid); + + memblock_merge_regions(type); +#endif + return 0; +} + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +/** + * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() + * + * @idx: pointer to u64 loop variable + * @zone: zone in which all of the memory blocks reside + * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL + * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL + * + * This function is meant to be a zone/pfn specific wrapper for the + * for_each_mem_range type iterators. 
Specifically they are used in the + * deferred memory init routines and as such we were duplicating much of + * this logic throughout the code. So instead of having it in multiple + * locations it seemed like it would make more sense to centralize this to + * one new iterator that does everything they need. + */ +void __init_memblock +__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, + unsigned long *out_spfn, unsigned long *out_epfn) +{ + int zone_nid = zone_to_nid(zone); + phys_addr_t spa, epa; + + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, + &memblock.memory, &memblock.reserved, + &spa, &epa, NULL); + + while (*idx != U64_MAX) { + unsigned long epfn = PFN_DOWN(epa); + unsigned long spfn = PFN_UP(spa); + + /* + * Verify the end is at least past the start of the zone and + * that we have at least one PFN to initialize. + */ + if (zone->zone_start_pfn < epfn && spfn < epfn) { + /* if we went too far just stop searching */ + if (zone_end_pfn(zone) <= spfn) { + *idx = U64_MAX; + break; + } + + if (out_spfn) + *out_spfn = max(zone->zone_start_pfn, spfn); + if (out_epfn) + *out_epfn = min(zone_end_pfn(zone), epfn); + + return; + } + + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, + &memblock.memory, &memblock.reserved, + &spa, &epa, NULL); + } + + /* signal end of iteration */ + if (out_spfn) + *out_spfn = ULONG_MAX; + if (out_epfn) + *out_epfn = 0; +} + +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + +/** + * memblock_alloc_range_nid - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @start: the lower bound of the memory region to allocate (phys address) + * @end: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @exact_nid: control the allocation fall back to other nodes + * + * The allocation is performed from memory region limited by + * memblock.current_limit if @end == 
%MEMBLOCK_ALLOC_ACCESSIBLE. + * + * If the specified node can not hold the requested memory and @exact_nid + * is false, the allocation falls back to any node in the system. + * + * For systems with memory mirroring, the allocation is attempted first + * from the regions with mirroring enabled and then retried from any + * memory region. + * + * In addition, function using kmemleak_alloc_phys for allocated boot + * memory block, it is never reported as leaks. + * + * Return: + * Physical address of allocated memory block on success, %0 on failure. + */ +phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid, + bool exact_nid) +{ + enum memblock_flags flags = choose_memblock_flags(); + phys_addr_t found; + + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + + if (!align) { + /* Can't use WARNs this early in boot on powerpc */ + dump_stack(); + align = SMP_CACHE_BYTES; + } + +again: + found = memblock_find_in_range_node(size, align, start, end, nid, + flags); + if (found && !memblock_reserve(found, size)) + goto done; + + if (nid != NUMA_NO_NODE && !exact_nid) { + found = memblock_find_in_range_node(size, align, start, + end, NUMA_NO_NODE, + flags); + if (found && !memblock_reserve(found, size)) + goto done; + } + + if (flags & MEMBLOCK_MIRROR) { + flags &= ~MEMBLOCK_MIRROR; + pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n", + &size); + goto again; + } + + return 0; + +done: + /* + * Skip kmemleak for those places like kasan_init() and + * early_pgtable_alloc() due to high volume. + */ + if (end != MEMBLOCK_ALLOC_NOLEAKTRACE) + /* + * Memblock allocated blocks are never reported as + * leaks. This is because many of these blocks are + * only referred via the physical address which is + * not looked up by kmemleak. 
+ */ + kmemleak_alloc_phys(found, size, 0); + + return found; +} + +/** + * memblock_phys_alloc_range - allocate a memory block inside specified range + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @start: the lower bound of the memory region to allocate (physical address) + * @end: the upper bound of the memory region to allocate (physical address) + * + * Allocate @size bytes in the between @start and @end. + * + * Return: physical address of the allocated memory block on success, + * %0 on failure. + */ +phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size, + phys_addr_t align, + phys_addr_t start, + phys_addr_t end) +{ + memblock_dbg("%s: %llu bytes align=0x%llx from=%pa max_addr=%pa %pS\n", + __func__, (u64)size, (u64)align, &start, &end, + (void *)_RET_IP_); + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, + false); +} + +/** + * memblock_phys_alloc_try_nid - allocate a memory block from specified NUMA node + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Allocates memory block from the specified NUMA node. If the node + * has no available memory, attempts to allocated from any node in the + * system. + * + * Return: physical address of the allocated memory block on success, + * %0 on failure. 
+ */ +phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) +{ + return memblock_alloc_range_nid(size, align, 0, + MEMBLOCK_ALLOC_ACCESSIBLE, nid, false); +} + +/** + * memblock_alloc_internal - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region to allocate (phys address) + * @max_addr: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @exact_nid: control the allocation fall back to other nodes + * + * Allocates memory block using memblock_alloc_range_nid() and + * converts the returned physical address to virtual. + * + * The @min_addr limit is dropped if it can not be satisfied and the allocation + * will fall back to memory below @min_addr. Other constraints, such + * as node and mirrored memory will be handled again in + * memblock_alloc_range_nid(). + * + * Return: + * Virtual address of allocated memory block on success, NULL on failure. 
+ */ +static void * __init memblock_alloc_internal( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid, bool exact_nid) +{ + phys_addr_t alloc; + + /* + * Detect any accidental use of these APIs after slab is ready, as at + * this moment memblock may be deinitialized already and its + * internal data may be destroyed (after execution of memblock_free_all) + */ + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, nid); + + if (max_addr > memblock.current_limit) + max_addr = memblock.current_limit; + + alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid, + exact_nid); + + /* retry allocation without lower limit */ + if (!alloc && min_addr) + alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid, + exact_nid); + + if (!alloc) + return NULL; + + return phys_to_virt(alloc); +} + +/** + * memblock_alloc_exact_nid_raw - allocate boot memory block on the exact node + * without zeroing memory + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public function, provides additional debug information (including caller + * info), if enabled. Does not zero allocated memory. + * + * Return: + * Virtual address of allocated memory block on success, NULL on failure. 
+ */ +void * __init memblock_alloc_exact_nid_raw( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); + + return memblock_alloc_internal(size, align, min_addr, max_addr, nid, + true); +} + +/** + * memblock_alloc_try_nid_raw - allocate boot memory block without zeroing + * memory and without panicking + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public function, provides additional debug information (including caller + * info), if enabled. Does not zero allocated memory, does not panic if request + * cannot be satisfied. + * + * Return: + * Virtual address of allocated memory block on success, NULL on failure. 
+ */ +void * __init memblock_alloc_try_nid_raw( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); + + return memblock_alloc_internal(size, align, min_addr, max_addr, nid, + false); +} + +/** + * memblock_alloc_try_nid - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public function, provides additional debug information (including caller + * info), if enabled. This function zeroes the allocated memory. + * + * Return: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_alloc_try_nid( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); + ptr = memblock_alloc_internal(size, align, + min_addr, max_addr, nid, false); + if (ptr) + memset(ptr, 0, size); + + return ptr; +} + +/** + * memblock_free_late - free pages directly to buddy allocator + * @base: phys starting address of the boot memory block + * @size: size of the boot memory block in bytes + * + * This is only useful when the memblock allocator has already been torn + * down, but we are still initializing the system. 
Pages are released directly + * to the buddy allocator. + */ +void __init memblock_free_late(phys_addr_t base, phys_addr_t size) +{ + phys_addr_t cursor, end; + + end = base + size - 1; + memblock_dbg("%s: [%pa-%pa] %pS\n", + __func__, &base, &end, (void *)_RET_IP_); + kmemleak_free_part_phys(base, size); + cursor = PFN_UP(base); + end = PFN_DOWN(base + size); + + for (; cursor < end; cursor++) { + memblock_free_pages(pfn_to_page(cursor), cursor, 0); + totalram_pages_inc(); + } +} + +/* + * Remaining API functions + */ + +phys_addr_t __init_memblock memblock_phys_mem_size(void) +{ + return memblock.memory.total_size; +} + +phys_addr_t __init_memblock memblock_reserved_size(void) +{ + return memblock.reserved.total_size; +} + +/* lowest address */ +phys_addr_t __init_memblock memblock_start_of_DRAM(void) +{ + return memblock.memory.regions[0].base; +} + +phys_addr_t __init_memblock memblock_end_of_DRAM(void) +{ + int idx = memblock.memory.cnt - 1; + + return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); +} + +static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) +{ + phys_addr_t max_addr = PHYS_ADDR_MAX; + struct memblock_region *r; + + /* + * translate the memory @limit size into the max address within one of + * the memory memblock regions, if the @limit exceeds the total size + * of those regions, max_addr will keep original value PHYS_ADDR_MAX + */ + for_each_mem_region(r) { + if (limit <= r->size) { + max_addr = r->base + limit; + break; + } + limit -= r->size; + } + + return max_addr; +} + +void __init memblock_enforce_memory_limit(phys_addr_t limit) +{ + phys_addr_t max_addr; + + if (!limit) + return; + + max_addr = __find_max_addr(limit); + + /* @limit exceeds the total size of the memory, do nothing */ + if (max_addr == PHYS_ADDR_MAX) + return; + + /* truncate both memory and reserved regions */ + memblock_remove_range(&memblock.memory, max_addr, + PHYS_ADDR_MAX); + memblock_remove_range(&memblock.reserved, 
max_addr, + PHYS_ADDR_MAX); +} + +void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) +{ + int start_rgn, end_rgn; + int i, ret; + + if (!size) + return; + + if (!memblock_memory->total_size) { + pr_warn("%s: No memory registered yet\n", __func__); + return; + } + + ret = memblock_isolate_range(&memblock.memory, base, size, + &start_rgn, &end_rgn); + if (ret) + return; + + /* remove all the MAP regions */ + for (i = memblock.memory.cnt - 1; i >= end_rgn; i--) + if (!memblock_is_nomap(&memblock.memory.regions[i])) + memblock_remove_region(&memblock.memory, i); + + for (i = start_rgn - 1; i >= 0; i--) + if (!memblock_is_nomap(&memblock.memory.regions[i])) + memblock_remove_region(&memblock.memory, i); + + /* truncate the reserved regions */ + memblock_remove_range(&memblock.reserved, 0, base); + memblock_remove_range(&memblock.reserved, + base + size, PHYS_ADDR_MAX); +} + +void __init memblock_mem_limit_remove_map(phys_addr_t limit) +{ + phys_addr_t max_addr; + + if (!limit) + return; + + max_addr = __find_max_addr(limit); + + /* @limit exceeds the total size of the memory, do nothing */ + if (max_addr == PHYS_ADDR_MAX) + return; + + memblock_cap_memory_range(0, max_addr); +} + +static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) +{ + unsigned int left = 0, right = type->cnt; + + do { + unsigned int mid = (right + left) / 2; + + if (addr < type->regions[mid].base) + right = mid; + else if (addr >= (type->regions[mid].base + + type->regions[mid].size)) + left = mid + 1; + else + return mid; + } while (left < right); + return -1; +} + +bool __init_memblock memblock_is_reserved(phys_addr_t addr) +{ + return memblock_search(&memblock.reserved, addr) != -1; +} + +bool __init_memblock memblock_is_memory(phys_addr_t addr) +{ + return memblock_search(&memblock.memory, addr) != -1; +} + +bool __init_memblock memblock_is_map_memory(phys_addr_t addr) +{ + int i = memblock_search(&memblock.memory, addr); + + if (i == -1) 
+ return false; + return !memblock_is_nomap(&memblock.memory.regions[i]); +} + +int __init_memblock memblock_search_pfn_nid(unsigned long pfn, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + struct memblock_type *type = &memblock.memory; + int mid = memblock_search(type, PFN_PHYS(pfn)); + + if (mid == -1) + return -1; + + *start_pfn = PFN_DOWN(type->regions[mid].base); + *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); + + return memblock_get_region_node(&type->regions[mid]); +} + +/** + * memblock_is_region_memory - check if a region is a subset of memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base + @size) is a subset of a memory block. + * + * Return: + * 0 if false, non-zero if true + */ +bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) +{ + int idx = memblock_search(&memblock.memory, base); + phys_addr_t end = base + memblock_cap_size(base, &size); + + if (idx == -1) + return false; + return (memblock.memory.regions[idx].base + + memblock.memory.regions[idx].size) >= end; +} + +/** + * memblock_is_region_reserved - check if a region intersects reserved memory + * @base: base of region to check + * @size: size of region to check + * + * Check if the region [@base, @base + @size) intersects a reserved + * memory block. + * + * Return: + * True if they intersect, false if not. 
+ */ +bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) +{ + return memblock_overlaps_region(&memblock.reserved, base, size); +} + +/* + * Shrink every memory region to @align boundaries: the base is rounded up + * and the end rounded down. Regions that become empty are removed. + */ +void __init_memblock memblock_trim_memory(phys_addr_t align) +{ + phys_addr_t start, end, orig_start, orig_end; + struct memblock_region *r; + + for_each_mem_region(r) { + orig_start = r->base; + orig_end = r->base + r->size; + start = round_up(orig_start, align); + end = round_down(orig_end, align); + + if (start == orig_start && end == orig_end) + continue; + + if (start < end) { + r->base = start; + r->size = end - start; + } else { + memblock_remove_region(&memblock.memory, + r - memblock.memory.regions); + r--; + } + } +} + +/* Set the limit stored in memblock.current_limit. */ +void __init_memblock memblock_set_current_limit(phys_addr_t limit) +{ + memblock.current_limit = limit; +} + +/* Return the limit stored in memblock.current_limit. */ +phys_addr_t __init_memblock memblock_get_current_limit(void) +{ + return memblock.current_limit; +} + +/* + * Log the region count of @type, then one line per region with its index, + * inclusive address range, size, node (CONFIG_NUMA only, when the node is not + * MAX_NUMNODES) and flags. + */ +static void __init_memblock memblock_dump(struct memblock_type *type) +{ + phys_addr_t base, end, size; + enum memblock_flags flags; + int idx; + struct memblock_region *rgn; + + pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); + + for_each_memblock_type(idx, type, rgn) { + char nid_buf[32] = ""; + + base = rgn->base; + size = rgn->size; + end = base + size - 1; + flags = rgn->flags; +#ifdef CONFIG_NUMA + if (memblock_get_region_node(rgn) != MAX_NUMNODES) + snprintf(nid_buf, sizeof(nid_buf), " on node %d", + memblock_get_region_node(rgn)); +#endif + pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#x\n", + type->name, idx, &base, &end, &size, nid_buf, flags); + } +} + +/* Log the total sizes, then dump memory, reserved and (if configured) physmem. */ +static void __init_memblock __memblock_dump_all(void) +{ + pr_info("MEMBLOCK configuration:\n"); + pr_info(" memory size = %pa reserved size = %pa\n", + &memblock.memory.total_size, + &memblock.reserved.total_size); + + memblock_dump(&memblock.memory); + memblock_dump(&memblock.reserved); +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP + memblock_dump(&physmem); +#endif +} + +void __init_memblock
memblock_dump_all(void) +{ + if (memblock_debug) + __memblock_dump_all(); +} + +/* Set memblock_can_resize. */ +void __init memblock_allow_resize(void) +{ + memblock_can_resize = 1; +} + +/* + * Handler for the "memblock" early parameter: sets memblock_debug when the + * value contains "debug". Always returns 0. + */ +static int __init early_memblock(char *p) +{ + if (p && strstr(p, "debug")) + memblock_debug = 1; + return 0; +} +early_param("memblock", early_memblock); + +/* + * Free the whole pages of the memmap (struct page array) that lie between the + * entries for @start_pfn and @end_pfn. + */ +static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn) +{ + struct page *start_pg, *end_pg; + phys_addr_t pg, pgend; + + /* + * Convert start_pfn/end_pfn to a struct page pointer. + */ + start_pg = pfn_to_page(start_pfn - 1) + 1; + end_pg = pfn_to_page(end_pfn - 1) + 1; + + /* + * Convert to physical addresses, and round start upwards and end + * downwards. + */ + pg = PAGE_ALIGN(__pa(start_pg)); + pgend = __pa(end_pg) & PAGE_MASK; + + /* + * If there are free pages between these, free the section of the + * memmap array. + */ + if (pg < pgend) + memblock_phys_free(pg, pgend - pg); +} + +/* + * The mem_map array can get very big. Free the unused area of the memory map. + * Only done when CONFIG_HAVE_ARCH_PFN_VALID is enabled and + * CONFIG_SPARSEMEM_VMEMMAP is not. + */ +static void __init free_unused_memmap(void) +{ + unsigned long start, end, prev_end = 0; + int i; + + if (!IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) || + IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) + return; + + /* + * This relies on each bank being in address order. + * The banks are sorted previously in bootmem_init(). + */ + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { +#ifdef CONFIG_SPARSEMEM + /* + * Take care not to free memmap entries that don't exist + * due to SPARSEMEM sections which aren't present. + */ + start = min(start, ALIGN(prev_end, PAGES_PER_SECTION)); +#endif + /* + * Align down here since many operations in VM subsystem + * presume that there are no holes in the memory map inside + * a pageblock + */ + start = pageblock_start_pfn(start); + + /* + * If we had a previous bank, and there is a space + * between the current bank and the previous, free it.
+ */ + if (prev_end && prev_end < start) + free_memmap(prev_end, start); + + /* + * Align up here since many operations in VM subsystem + * presume that there are no holes in the memory map inside + * a pageblock + */ + prev_end = pageblock_align(end); + } + +#ifdef CONFIG_SPARSEMEM + if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) { + prev_end = pageblock_align(end); + free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION)); + } +#endif +} + +/* + * Pass the PFN range [@start, @end) to memblock_free_pages() in chunks. Each + * chunk uses the largest order allowed by the alignment of its first PFN, + * capped at MAX_ORDER - 1 and reduced until the chunk fits before @end. + */ +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int order; + + while (start < end) { + order = min(MAX_ORDER - 1UL, __ffs(start)); + + while (start + (1UL << order) > end) + order--; + + memblock_free_pages(pfn_to_page(start), start, order); + + start += (1UL << order); + } +} + +/* + * Free the whole pages inside the physical range [@start, @end), with the end + * PFN capped at max_low_pfn. Returns the number of pages in the freed PFN + * range, or 0 if the range holds no whole page. + */ +static unsigned long __init __free_memory_core(phys_addr_t start, + phys_addr_t end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = min_t(unsigned long, + PFN_DOWN(end), max_low_pfn); + + if (start_pfn >= end_pfn) + return 0; + + __free_pages_memory(start_pfn, end_pfn); + + return end_pfn - start_pfn; +} + +/* Call reserve_bootmem_region() on every reserved range and every NOMAP region. */ +static void __init memmap_init_reserved_pages(void) +{ + struct memblock_region *region; + phys_addr_t start, end; + u64 i; + + /* initialize struct pages for the reserved regions */ + for_each_reserved_mem_range(i, &start, &end) + reserve_bootmem_region(start, end); + + /* and also treat struct pages for the NOMAP regions as PageReserved */ + for_each_mem_region(region) { + if (memblock_is_nomap(region)) { + start = region->base; + end = start + region->size; + reserve_bootmem_region(start, end); + } + } +} + +/* + * Clear the hotplug flag on all memory, initialise the reserved pages, then + * free every free memblock range. Returns the number of pages freed. + */ +static unsigned long __init free_low_memory_core_early(void) +{ + unsigned long count = 0; + phys_addr_t start, end; + u64 i; + + memblock_clear_hotplug(0, -1); + + memmap_init_reserved_pages(); + + /* + * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id + * because in some cases, e.g. when Node0 has no RAM installed, + * low RAM will be on Node1. + */ +
for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) + count += __free_memory_core(start, end); + + return count; +} + +/* Set once reset_all_zones_managed_pages() has run, so it only runs once. */ +static int reset_managed_pages_done __initdata; + +/* Zero the managed_pages counter of every zone in @pgdat. */ +void reset_node_managed_pages(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + atomic_long_set(&z->managed_pages, 0); +} + +/* Zero managed_pages for all zones of all online nodes; later calls do nothing. */ +void __init reset_all_zones_managed_pages(void) +{ + struct pglist_data *pgdat; + + if (reset_managed_pages_done) + return; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + + reset_managed_pages_done = 1; +} + +/** + * memblock_free_all - release free pages to the buddy allocator + */ +void __init memblock_free_all(void) +{ + unsigned long pages; + + free_unused_memmap(); + reset_all_zones_managed_pages(); + + pages = free_low_memory_core_early(); + totalram_pages_add(pages); +} + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) + +/* + * seq_file show callback: print one "index: base..end" line for each region + * of the memblock_type stored in m->private (end is inclusive). + */ +static int memblock_debug_show(struct seq_file *m, void *private) +{ + struct memblock_type *type = m->private; + struct memblock_region *reg; + int i; + phys_addr_t end; + + for (i = 0; i < type->cnt; i++) { + reg = &type->regions[i]; + end = reg->base + reg->size - 1; + + seq_printf(m, "%4d: ", i); + seq_printf(m, "%pa..%pa\n", &reg->base, &end); + } + return 0; +} +DEFINE_SHOW_ATTRIBUTE(memblock_debug); + +/* + * Create the "memblock" debugfs directory with "memory" and "reserved" files + * (plus "physmem" with CONFIG_HAVE_MEMBLOCK_PHYS_MAP), all read-only. + */ +static int __init memblock_init_debugfs(void) +{ + struct dentry *root = debugfs_create_dir("memblock", NULL); + + debugfs_create_file("memory", 0444, root, + &memblock.memory, &memblock_debug_fops); + debugfs_create_file("reserved", 0444, root, + &memblock.reserved, &memblock_debug_fops); +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP + debugfs_create_file("physmem", 0444, root, &physmem, + &memblock_debug_fops); +#endif + + return 0; +} +__initcall(memblock_init_debugfs); + +#endif /* CONFIG_DEBUG_FS */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c new file mode 100644 index 000000000..9da98e3e7 --- /dev/null +++
b/mm/memcontrol.c @@ -0,0 +1,7805 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* memcontrol.c - Memory Controller + * + * Copyright IBM Corporation, 2007 + * Author Balbir Singh + * + * Copyright 2007 OpenVZ SWsoft Inc + * Author: Pavel Emelianov + * + * Memory thresholds + * Copyright (C) 2009 Nokia Corporation + * Author: Kirill A. Shutemov + * + * Kernel Memory Controller + * Copyright (C) 2012 Parallels Inc. and Google Inc. + * Authors: Glauber Costa and Suleiman Souhlal + * + * Native page reclaim + * Charge lifetime sanitation + * Lockless page tracking & accounting + * Unified hierarchy configuration model + * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner + * + * Per memcg lru locking + * Copyright (C) 2020 Alibaba, Inc, Alex Shi + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include +#include +#include "slab.h" +#include "swap.h" + +#include + +#include + +struct cgroup_subsys memory_cgrp_subsys __read_mostly; +EXPORT_SYMBOL(memory_cgrp_subsys); + +struct mem_cgroup *root_mem_cgroup __read_mostly; + +/* Active memory cgroup to use from an interrupt context */ +DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); +EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); + +/* Socket memory accounting disabled? */ +static bool cgroup_memory_nosocket __ro_after_init; + +/* Kernel memory accounting disabled? 
*/ +static bool cgroup_memory_nokmem __ro_after_init; + +#ifdef CONFIG_CGROUP_WRITEBACK +static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); +#endif + +/* Whether legacy memory+swap accounting is active */ +static bool do_memsw_account(void) +{ + return !cgroup_subsys_on_dfl(memory_cgrp_subsys); +} + +#define THRESHOLDS_EVENTS_TARGET 128 +#define SOFTLIMIT_EVENTS_TARGET 1024 + +/* + * Cgroups above their limits are maintained in a RB-Tree, independent of + * their hierarchy representation + */ + +struct mem_cgroup_tree_per_node { + struct rb_root rb_root; + struct rb_node *rb_rightmost; + spinlock_t lock; +}; + +struct mem_cgroup_tree { + struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; +}; + +static struct mem_cgroup_tree soft_limit_tree __read_mostly; + +/* for OOM */ +struct mem_cgroup_eventfd_list { + struct list_head list; + struct eventfd_ctx *eventfd; +}; + +/* + * mem_cgroup_event represents an event which userspace wants to receive. + */ +struct mem_cgroup_event { + /* + * memcg which the event belongs to. + */ + struct mem_cgroup *memcg; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these is stored in a list by the cgroup. + */ + struct list_head list; + /* + * register_event() callback will be used to add new userspace + * waiter for changes related to this event. Use eventfd_signal() + * on eventfd to send notification to userspace. + */ + int (*register_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args); + /* + * unregister_event() callback will be called when userspace closes + * the eventfd or on cgroup removal. This callback must be set + * if you want to provide notification functionality. + */ + void (*unregister_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd); + /* + * All fields below needed to unregister event when + * userspace closes eventfd.
+ */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_entry_t wait; + struct work_struct remove; +}; + +static void mem_cgroup_threshold(struct mem_cgroup *memcg); +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); + +/* State for moving charges at task migration. */ +/* + * Types of charges to be moved. + */ +#define MOVE_ANON 0x1U +#define MOVE_FILE 0x2U +#define MOVE_MASK (MOVE_ANON | MOVE_FILE) + +/* "mc" and its members are protected by cgroup_mutex */ +static struct move_charge_struct { + spinlock_t lock; /* for from, to */ + struct mm_struct *mm; + struct mem_cgroup *from; + struct mem_cgroup *to; + unsigned long flags; + unsigned long precharge; + unsigned long moved_charge; + unsigned long moved_swap; + struct task_struct *moving_task; /* a task moving charges */ + wait_queue_head_t waitq; /* a waitq for other context */ +} mc = { + .lock = __SPIN_LOCK_UNLOCKED(mc.lock), + .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), +}; + +/* + * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft + * limit reclaim to prevent infinite loops, if they ever occur. + */ +#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 +#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 + +/* for encoding cft->private value on file */ +enum res_type { + _MEM, + _MEMSWAP, + _KMEM, + _TCP, +}; + +/* + * MEMFILE_PRIVATE() packs @x into the bits above bit 15 and ORs in @val; + * MEMFILE_TYPE() and MEMFILE_ATTR() extract the upper and lower 16-bit + * fields again. NOTE(review): @x is presumably an enum res_type value and + * @val a per-file attribute id below 0x10000 - confirm against the users. + */ +#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) +#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) +#define MEMFILE_ATTR(val) ((val) & 0xffff) + +/* + * Iteration constructs for visiting all cgroups (under a tree). If + * loops are exited prematurely (break), mem_cgroup_iter_break() must + * be used for reference counting.
+ */ +#define for_each_mem_cgroup_tree(iter, root) \ + for (iter = mem_cgroup_iter(root, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(root, iter, NULL)) + +#define for_each_mem_cgroup(iter) \ + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(NULL, iter, NULL)) + +/* True if current is an OOM victim, has a fatal signal pending, or is exiting. */ +static inline bool task_is_dying(void) +{ + return tsk_is_oom_victim(current) || fatal_signal_pending(current) || + (current->flags & PF_EXITING); +} + +/* Some nice accessors for the vmpressure. */ +struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) +{ + if (!memcg) + memcg = root_mem_cgroup; + return &memcg->vmpressure; +} + +struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) +{ + return container_of(vmpr, struct mem_cgroup, vmpressure); +} + +#ifdef CONFIG_MEMCG_KMEM +static DEFINE_SPINLOCK(objcg_lock); + +bool mem_cgroup_kmem_disabled(void) +{ + return cgroup_memory_nokmem; +} + +static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, + unsigned int nr_pages); + +/* + * percpu_ref release callback for an obj_cgroup: uncharge any whole pages + * still recorded in nr_charged_bytes, unlink the objcg from its list and free + * it after an RCU grace period. + */ +static void obj_cgroup_release(struct percpu_ref *ref) +{ + struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt); + unsigned int nr_bytes; + unsigned int nr_pages; + unsigned long flags; + + /* + * At this point all allocated objects are freed, and + * objcg->nr_charged_bytes can't have an arbitrary byte value. + * However, it can be PAGE_SIZE or (x * PAGE_SIZE). + * + * The following sequence can lead to it: + * 1) CPU0: objcg == stock->cached_objcg + * 2) CPU1: we do a small allocation (e.g. 92 bytes), + * PAGE_SIZE bytes are charged + * 3) CPU1: a process from another memcg is allocating something, + * the stock is flushed, + * objcg->nr_charged_bytes = PAGE_SIZE - 92 + * 4) CPU0: we do release this object, + * 92 bytes are added to stock->nr_bytes + * 5) CPU0: stock is flushed, + * 92 bytes are added to objcg->nr_charged_bytes + * + * As a result, nr_charged_bytes == PAGE_SIZE.
+ * This page will be uncharged in obj_cgroup_release(). + */ + nr_bytes = atomic_read(&objcg->nr_charged_bytes); + WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); + nr_pages = nr_bytes >> PAGE_SHIFT; + + if (nr_pages) + obj_cgroup_uncharge_pages(objcg, nr_pages); + + spin_lock_irqsave(&objcg_lock, flags); + list_del(&objcg->list); + spin_unlock_irqrestore(&objcg_lock, flags); + + percpu_ref_exit(ref); + kfree_rcu(objcg, rcu); +} + +/* + * Allocate a zeroed obj_cgroup with an initialised refcnt and an empty list + * head. Returns NULL if either allocation fails. + */ +static struct obj_cgroup *obj_cgroup_alloc(void) +{ + struct obj_cgroup *objcg; + int ret; + + objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL); + if (!objcg) + return NULL; + + ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0, + GFP_KERNEL); + if (ret) { + kfree(objcg); + return NULL; + } + INIT_LIST_HEAD(&objcg->list); + return objcg; +} + +/* + * Detach @memcg's active objcg, point it and every objcg already on + * @memcg->objcg_list at @parent, move them all to @parent's list, then kill + * the detached objcg's refcnt. + */ +static void memcg_reparent_objcgs(struct mem_cgroup *memcg, + struct mem_cgroup *parent) +{ + struct obj_cgroup *objcg, *iter; + + objcg = rcu_replace_pointer(memcg->objcg, NULL, true); + + spin_lock_irq(&objcg_lock); + + /* 1) Ready to reparent active objcg. */ + list_add(&objcg->list, &memcg->objcg_list); + /* 2) Reparent active objcg and already reparented objcgs to parent. */ + list_for_each_entry(iter, &memcg->objcg_list, list) + WRITE_ONCE(iter->memcg, parent); + /* 3) Move already reparented objcgs to the parent's list */ + list_splice(&memcg->objcg_list, &parent->objcg_list); + + spin_unlock_irq(&objcg_lock); + + percpu_ref_kill(&objcg->refcnt); +} + +/* + * A lot of the calls to the cache allocation functions are expected to be + * inlined by the compiler.
Since the calls to memcg_slab_pre_alloc_hook() are + * conditional on this static branch, we have to allow modules that do + * kmem_cache_alloc and the like to see this symbol as well. + */ +DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); +EXPORT_SYMBOL(memcg_kmem_enabled_key); +#endif + +/** + * mem_cgroup_css_from_page - css of the memcg associated with a page + * @page: page of interest + * + * If memcg is bound to the default hierarchy, css of the memcg associated + * with @page is returned. The returned css remains associated with @page + * until it is released. + * + * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup + * is returned. + */ +struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) +{ + struct mem_cgroup *memcg; + + memcg = page_memcg(page); + + if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + memcg = root_mem_cgroup; + + return &memcg->css; +} + +/** + * page_cgroup_ino - return inode number of the memcg a page is charged to + * @page: the page + * + * Look up the closest online ancestor of the memory cgroup @page is charged to + * and return its inode number or 0 if @page is not charged to any cgroup. It + * is safe to call this function without holding a reference to @page. + * + * Note, this function is inherently racy, because there is nothing to prevent + * the cgroup inode from getting torn down and potentially reallocated a moment + * after page_cgroup_ino() returns, so it only should be used by callers that + * do not care (such as procfs interfaces).
+ */ +ino_t page_cgroup_ino(struct page *page) +{ + struct mem_cgroup *memcg; + unsigned long ino = 0; + + rcu_read_lock(); + memcg = page_memcg_check(page); + + while (memcg && !(memcg->css.flags & CSS_ONLINE)) + memcg = parent_mem_cgroup(memcg); + if (memcg) + ino = cgroup_ino(memcg->css.cgroup); + rcu_read_unlock(); + return ino; +} + +/* + * Link @mz into @mctz's RB-tree, ordered by usage_in_excess, and record it in + * rb_rightmost when it ends up as the rightmost node. Returns early if @mz is + * already on the tree. If @new_usage_in_excess is zero, usage_in_excess is + * still updated but @mz is not inserted. + */ +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, + struct mem_cgroup_tree_per_node *mctz, + unsigned long new_usage_in_excess) +{ + struct rb_node **p = &mctz->rb_root.rb_node; + struct rb_node *parent = NULL; + struct mem_cgroup_per_node *mz_node; + bool rightmost = true; + + if (mz->on_tree) + return; + + mz->usage_in_excess = new_usage_in_excess; + if (!mz->usage_in_excess) + return; + while (*p) { + parent = *p; + mz_node = rb_entry(parent, struct mem_cgroup_per_node, + tree_node); + if (mz->usage_in_excess < mz_node->usage_in_excess) { + p = &(*p)->rb_left; + rightmost = false; + } else { + p = &(*p)->rb_right; + } + } + + if (rightmost) + mctz->rb_rightmost = &mz->tree_node; + + rb_link_node(&mz->tree_node, parent, p); + rb_insert_color(&mz->tree_node, &mctz->rb_root); + mz->on_tree = true; +} + +/* + * Unlink @mz from @mctz's RB-tree if it is on it, moving rb_rightmost to the + * previous node when @mz was the rightmost one. + */ +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, + struct mem_cgroup_tree_per_node *mctz) +{ + if (!mz->on_tree) + return; + + if (&mz->tree_node == mctz->rb_rightmost) + mctz->rb_rightmost = rb_prev(&mz->tree_node); + + rb_erase(&mz->tree_node, &mctz->rb_root); + mz->on_tree = false; +} + +/* __mem_cgroup_remove_exceeded() under mctz->lock with interrupts saved. */ +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, + struct mem_cgroup_tree_per_node *mctz) +{ + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); + __mem_cgroup_remove_exceeded(mz, mctz); + spin_unlock_irqrestore(&mctz->lock, flags); +} + +/* Number of pages by which @memcg's memory counter exceeds its soft limit, or 0. */ +static unsigned long soft_limit_excess(struct mem_cgroup *memcg) +{ + unsigned long nr_pages = page_counter_read(&memcg->memory); + unsigned long soft_limit = READ_ONCE(memcg->soft_limit); + unsigned long excess = 0; + + if
(nr_pages > soft_limit) + excess = nr_pages - soft_limit; + + return excess; +} + +/* + * Re-sort @memcg and all of its ancestors in node @nid's soft-limit tree + * according to their current soft-limit excess. Does nothing if the node has + * no tree. + */ +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) +{ + unsigned long excess; + struct mem_cgroup_per_node *mz; + struct mem_cgroup_tree_per_node *mctz; + + mctz = soft_limit_tree.rb_tree_per_node[nid]; + if (!mctz) + return; + /* + * Necessary to update all ancestors when hierarchy is used, + * because their event counter is not touched. + */ + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + mz = memcg->nodeinfo[nid]; + excess = soft_limit_excess(memcg); + /* + * We have to update the tree if mz is on RB-tree or + * mem is over its softlimit. + */ + if (excess || mz->on_tree) { + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); + /* if on-tree, remove it */ + if (mz->on_tree) + __mem_cgroup_remove_exceeded(mz, mctz); + /* + * Insert again. mz->usage_in_excess will be updated. + * If excess is 0, no tree ops. + */ + __mem_cgroup_insert_exceeded(mz, mctz, excess); + spin_unlock_irqrestore(&mctz->lock, flags); + } + } +} + +/* Remove @memcg's per-node entries from every node's soft-limit tree. */ +static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) +{ + struct mem_cgroup_tree_per_node *mctz; + struct mem_cgroup_per_node *mz; + int nid; + + for_each_node(nid) { + mz = memcg->nodeinfo[nid]; + mctz = soft_limit_tree.rb_tree_per_node[nid]; + if (mctz) + mem_cgroup_remove_exceeded(mz, mctz); + } +} + +/* + * Take the rightmost node off @mctz's tree and return it with a css reference + * held. Nodes whose memcg is no longer over its soft limit, or whose css + * reference cannot be taken, are dropped and the next rightmost is tried. + * Returns NULL when the tree is empty. + */ +static struct mem_cgroup_per_node * +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) +{ + struct mem_cgroup_per_node *mz; + +retry: + mz = NULL; + if (!mctz->rb_rightmost) + goto done; /* Nothing to reclaim from */ + + mz = rb_entry(mctz->rb_rightmost, + struct mem_cgroup_per_node, tree_node); + /* + * Remove the node now but someone else can add it back, + * we will add it back at the end of reclaim to its correct + * position in the tree.
+ */ + __mem_cgroup_remove_exceeded(mz, mctz); + if (!soft_limit_excess(mz->memcg) || + !css_tryget(&mz->memcg->css)) + goto retry; +done: + return mz; +} + +/* __mem_cgroup_largest_soft_limit_node() under mctz->lock with IRQs disabled. */ +static struct mem_cgroup_per_node * +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) +{ + struct mem_cgroup_per_node *mz; + + spin_lock_irq(&mctz->lock); + mz = __mem_cgroup_largest_soft_limit_node(mctz); + spin_unlock_irq(&mctz->lock); + return mz; +} + +/* + * memcg and lruvec stats flushing + * + * Many codepaths leading to stats update or read are performance sensitive and + * adding stats flushing in such codepaths is not desirable. So, to optimize the + * flushing the kernel does: + * + * 1) Periodically and asynchronously flush the stats every 2 seconds to not let + * rstat update tree grow unbounded. + * + * 2) Flush the stats synchronously on reader side only when there are more than + * (MEMCG_CHARGE_BATCH * nr_cpus) update events. This optimization lets the + * stats be out of sync by at most (MEMCG_CHARGE_BATCH * nr_cpus), but + * only for 2 seconds due to (1). + */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static DEFINE_SPINLOCK(stats_flush_lock); +static DEFINE_PER_CPU(unsigned int, stats_updates); +static atomic_t stats_flush_threshold = ATOMIC_INIT(0); +static u64 flush_next_time; + +#define FLUSH_TIME (2UL*HZ) + +/* + * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can + * not rely on this as part of an acquired spinlock_t lock. These functions are + * never used in hardirq context on PREEMPT_RT and therefore disabling preemption + * is sufficient.
+ */ +static void memcg_stats_lock(void) +{ + preempt_disable_nested(); + VM_WARN_ON_IRQS_ENABLED(); +} + +static void __memcg_stats_lock(void) +{ + preempt_disable_nested(); +} + +static void memcg_stats_unlock(void) +{ + preempt_enable_nested(); +} + +/* + * Note a stats update of magnitude |@val| for @memcg on this CPU. Once the + * per-CPU stats_updates count exceeds MEMCG_CHARGE_BATCH it is folded into + * stats_flush_threshold and reset. + */ +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) +{ + unsigned int x; + + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + + x = __this_cpu_add_return(stats_updates, abs(val)); + if (x > MEMCG_CHARGE_BATCH) { + /* + * If stats_flush_threshold exceeds the threshold + * (>num_online_cpus()), cgroup stats update will be triggered + * in __mem_cgroup_flush_stats(). Increasing this var further + * is redundant and simply adds overhead in atomic update. + */ + if (atomic_read(&stats_flush_threshold) <= num_online_cpus()) + atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); + __this_cpu_write(stats_updates, 0); + } +} + +/* + * Flush rstat for the root memcg and reset stats_flush_threshold. Returns + * without flushing if stats_flush_lock is already held. + */ +static void __mem_cgroup_flush_stats(void) +{ + unsigned long flag; + + if (!spin_trylock_irqsave(&stats_flush_lock, flag)) + return; + + flush_next_time = jiffies_64 + 2*FLUSH_TIME; + cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); + atomic_set(&stats_flush_threshold, 0); + spin_unlock_irqrestore(&stats_flush_lock, flag); +} + +/* Flush only when stats_flush_threshold has risen above num_online_cpus(). */ +void mem_cgroup_flush_stats(void) +{ + if (atomic_read(&stats_flush_threshold) > num_online_cpus()) + __mem_cgroup_flush_stats(); +} + +/* Like mem_cgroup_flush_stats(), but only once flush_next_time has passed. */ +void mem_cgroup_flush_stats_delayed(void) +{ + if (time_after64(jiffies_64, flush_next_time)) + mem_cgroup_flush_stats(); +} + +/* Delayed-work handler: flush, then re-queue itself FLUSH_TIME later. */ +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + __mem_cgroup_flush_stats(); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); +} + +/* Subset of vm_event_item to report for memcg event stats */ +static const unsigned int memcg_vm_event_stat[] = { + PGPGIN, + PGPGOUT, + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGFAULT, + PGMAJFAULT, + PGREFILL, + PGACTIVATE, + PGDEACTIVATE, + PGLAZYFREE, +
PGLAZYFREED, +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + ZSWPIN, + ZSWPOUT, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + THP_FAULT_ALLOC, + THP_COLLAPSE_ALLOC, +#endif +}; + +#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) +/* + * Maps a vm_event_item to its position in memcg_vm_event_stat[] plus one. + * Entries that init_memcg_events() does not set stay zero, i.e. "not listed". + */ +static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; + +/* Fill mem_cgroup_events_index[] from memcg_vm_event_stat[]. */ +static void init_memcg_events(void) +{ + int i; + + for (i = 0; i < NR_MEMCG_EVENTS; ++i) + mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; +} + +/* Position of @idx in memcg_vm_event_stat[], or -1 if it is not listed there. */ +static inline int memcg_events_index(enum vm_event_item idx) +{ + return mem_cgroup_events_index[idx] - 1; +} + +struct memcg_vmstats_percpu { + /* Local (CPU and cgroup) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_MEMCG_EVENTS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[MEMCG_NR_STAT]; + unsigned long events_prev[NR_MEMCG_EVENTS]; + + /* Cgroup1: threshold notifications & softlimit tree updates */ + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct memcg_vmstats { + /* Aggregated (CPU and subtree) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_MEMCG_EVENTS]; + + /* Pending child counts during tree propagation */ + long state_pending[MEMCG_NR_STAT]; + unsigned long events_pending[NR_MEMCG_EVENTS]; +}; + +/* + * Read @memcg's aggregated vmstats state[@idx]. With CONFIG_SMP a negative + * value is reported as 0. + */ +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats->state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + +/** + * __mod_memcg_state - update cgroup memory statistics + * @memcg: the memory cgroup + * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item + * @val: delta to add to the counter, can be negative + */ +void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) +{ + if (mem_cgroup_disabled()) + return; + + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); + memcg_rstat_updated(memcg, val); +} + +/* idx can be of type enum memcg_stat_item or
node_stat_item. */ +static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) +{ + long x = 0; + int cpu; + + for_each_possible_cpu(cpu) + x += per_cpu(memcg->vmstats_percpu->state[idx], cpu); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + +void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val) +{ + struct mem_cgroup_per_node *pn; + struct mem_cgroup *memcg; + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = pn->memcg; + + /* + * The callers from rmap rely on disabled preemption because they never + * update their counters from in-interrupt context. For these + * counters we check that the update is never performed from an + * interrupt context, while other callers need to have interrupts disabled. + */ + __memcg_stats_lock(); + if (IS_ENABLED(CONFIG_DEBUG_VM)) { + switch (idx) { + case NR_ANON_MAPPED: + case NR_FILE_MAPPED: + case NR_ANON_THPS: + case NR_SHMEM_PMDMAPPED: + case NR_FILE_PMDMAPPED: + WARN_ON_ONCE(!in_task()); + break; + default: + VM_WARN_ON_IRQS_ENABLED(); + } + } + + /* Update memcg */ + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); + + /* Update lruvec */ + __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + + memcg_rstat_updated(memcg, val); + memcg_stats_unlock(); +} + +/** + * __mod_lruvec_state - update lruvec memory statistics + * @lruvec: the lruvec + * @idx: the stat item + * @val: delta to add to the counter, can be negative + * + * The lruvec is the intersection of the NUMA node and a cgroup. This + * function updates all three counters that are affected by a + * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */ +void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val) +{ + /* Update node */ + __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + + /* Update memcg and lruvec */ + if (!mem_cgroup_disabled()) + __mod_memcg_lruvec_state(lruvec, idx, val); +} + +/* + * Apply @val to stat @idx for the lruvec that @page's memcg has on @page's + * node. The memcg is looked up on the compound head. Pages without a memcg + * only update the node counter. + */ +void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, + int val) +{ + struct page *head = compound_head(page); /* rmap on tail pages */ + struct mem_cgroup *memcg; + pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; + + rcu_read_lock(); + memcg = page_memcg(head); + /* Untracked pages have no memcg, no lruvec. Update only the node */ + if (!memcg) { + rcu_read_unlock(); + __mod_node_page_state(pgdat, idx, val); + return; + } + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + __mod_lruvec_state(lruvec, idx, val); + rcu_read_unlock(); +} +EXPORT_SYMBOL(__mod_lruvec_page_state); + +void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) +{ + pg_data_t *pgdat = page_pgdat(virt_to_page(p)); + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + rcu_read_lock(); + memcg = mem_cgroup_from_slab_obj(p); + + /* + * Untracked pages have no memcg, no lruvec. Update only the + * node. If we reparent the slab objects to the root memcg, + * when we free the slab object, we need to update the per-memcg + * vmstats to keep it correct for the root memcg.
+ */ + if (!memcg) { + __mod_node_page_state(pgdat, idx, val); + } else { + lruvec = mem_cgroup_lruvec(memcg, pgdat); + __mod_lruvec_state(lruvec, idx, val); + } + rcu_read_unlock(); +} + +/** + * __count_memcg_events - account VM events in a cgroup + * @memcg: the memory cgroup + * @idx: the event item + * @count: the number of events that occurred + * + * Events that are not listed in memcg_vm_event_stat[] are ignored. + */ +void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, + unsigned long count) +{ + int index = memcg_events_index(idx); + + if (mem_cgroup_disabled() || index < 0) + return; + + memcg_stats_lock(); + __this_cpu_add(memcg->vmstats_percpu->events[index], count); + memcg_rstat_updated(memcg, count); + memcg_stats_unlock(); +} + +/* + * Read @memcg's aggregated count for @event from memcg->vmstats. Returns 0 + * for events that are not listed in memcg_vm_event_stat[]. + */ +static unsigned long memcg_events(struct mem_cgroup *memcg, int event) +{ + int index = memcg_events_index(event); + + if (index < 0) + return 0; + return READ_ONCE(memcg->vmstats->events[index]); +} + +/* + * Sum @memcg's per-CPU counts for @event over all possible CPUs. Returns 0 + * for events that are not listed in memcg_vm_event_stat[]. + */ +static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) +{ + long x = 0; + int cpu; + int index = memcg_events_index(event); + + if (index < 0) + return 0; + + for_each_possible_cpu(cpu) + x += per_cpu(memcg->vmstats_percpu->events[index], cpu); + return x; +} + +static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, + int nr_pages) +{ + /* pagein of a big page is an event.
So, ignore page size */ + if (nr_pages > 0) + __count_memcg_events(memcg, PGPGIN, 1); + else { + __count_memcg_events(memcg, PGPGOUT, 1); + nr_pages = -nr_pages; /* for event */ + } + + __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); +} + +static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, + enum mem_cgroup_events_target target) +{ + unsigned long val, next; + + val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); + next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); + /* from time_after() in jiffies.h */ + if ((long)(next - val) < 0) { + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; + default: + break; + } + __this_cpu_write(memcg->vmstats_percpu->targets[target], next); + return true; + } + return false; +} + +/* + * Check events in order. + * + */ +static void memcg_check_events(struct mem_cgroup *memcg, int nid) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return; + + /* threshold event is triggered in finer grain than soft limit */ + if (unlikely(mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_THRESH))) { + bool do_softlimit; + + do_softlimit = mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_SOFTLIMIT); + mem_cgroup_threshold(memcg); + if (unlikely(do_softlimit)) + mem_cgroup_update_tree(memcg, nid); + } +} + +struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) +{ + /* + * mm_update_next_owner() may clear mm->owner to NULL + * if it races with swapoff, page migration, etc. + * So this can be called with p == NULL. 
+ */ + if (unlikely(!p)) + return NULL; + + return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); +} +EXPORT_SYMBOL(mem_cgroup_from_task); + +static __always_inline struct mem_cgroup *active_memcg(void) +{ + if (!in_task()) + return this_cpu_read(int_active_memcg); + else + return current->active_memcg; +} + +/** + * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg. + * @mm: mm from which memcg should be extracted. It can be NULL. + * + * Obtain a reference on mm->memcg and returns it if successful. If mm + * is NULL, then the memcg is chosen as follows: + * 1) The active memcg, if set. + * 2) current->mm->memcg, if available + * 3) root memcg + * If mem_cgroup is disabled, NULL is returned. + */ +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return NULL; + + /* + * Page cache insertions can happen without an + * actual mm context, e.g. during disk probing + * on boot, loopback IO, acct() writes etc. + * + * No need to css_get on root memcg as the reference + * counting is disabled on the root level in the + * cgroup core. See CSS_NO_REF. + */ + if (unlikely(!mm)) { + memcg = active_memcg(); + if (unlikely(memcg)) { + /* remote memcg must hold a ref */ + css_get(&memcg->css); + return memcg; + } + mm = current->mm; + if (unlikely(!mm)) + return root_mem_cgroup; + } + + rcu_read_lock(); + do { + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) + memcg = root_mem_cgroup; + } while (!css_tryget(&memcg->css)); + rcu_read_unlock(); + return memcg; +} +EXPORT_SYMBOL(get_mem_cgroup_from_mm); + +static __always_inline bool memcg_kmem_bypass(void) +{ + /* Allow remote memcg charging from any context. */ + if (unlikely(active_memcg())) + return false; + + /* Memcg to charge can't be determined. 
*/ + if (!in_task() || !current->mm || (current->flags & PF_KTHREAD)) + return true; + + return false; +} + +/** + * mem_cgroup_iter - iterate over memory cgroup hierarchy + * @root: hierarchy root + * @prev: previously returned memcg, NULL on first invocation + * @reclaim: cookie for shared reclaim walks, NULL for full walks + * + * Returns references to children of the hierarchy below @root, or + * @root itself, or %NULL after a full round-trip. + * + * Caller must pass the return value in @prev on subsequent + * invocations for reference counting, or use mem_cgroup_iter_break() + * to cancel a hierarchy walk before the round-trip is complete. + * + * Reclaimers can specify a node in @reclaim to divide up the memcgs + * in the hierarchy among all concurrent reclaimers operating on the + * same node. + */ +struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, + struct mem_cgroup *prev, + struct mem_cgroup_reclaim_cookie *reclaim) +{ + struct mem_cgroup_reclaim_iter *iter; + struct cgroup_subsys_state *css = NULL; + struct mem_cgroup *memcg = NULL; + struct mem_cgroup *pos = NULL; + + if (mem_cgroup_disabled()) + return NULL; + + if (!root) + root = root_mem_cgroup; + + rcu_read_lock(); + + if (reclaim) { + struct mem_cgroup_per_node *mz; + + mz = root->nodeinfo[reclaim->pgdat->node_id]; + iter = &mz->iter; + + /* + * On start, join the current reclaim iteration cycle. + * Exit when a concurrent walker completes it. + */ + if (!prev) + reclaim->generation = iter->generation; + else if (reclaim->generation != iter->generation) + goto out_unlock; + + while (1) { + pos = READ_ONCE(iter->position); + if (!pos || css_tryget(&pos->css)) + break; + /* + * css reference reached zero, so iter->position will + * be cleared by ->css_released. However, we should not + * rely on this happening soon, because ->css_released + * is called from a work queue, and by busy-waiting we + * might block it. So we clear iter->position right + * away. 
+ */ + (void)cmpxchg(&iter->position, pos, NULL); + } + } else if (prev) { + pos = prev; + } + + if (pos) + css = &pos->css; + + for (;;) { + css = css_next_descendant_pre(css, &root->css); + if (!css) { + /* + * Reclaimers share the hierarchy walk, and a + * new one might jump in right at the end of + * the hierarchy - make sure they see at least + * one group and restart from the beginning. + */ + if (!prev) + continue; + break; + } + + /* + * Verify the css and acquire a reference. The root + * is provided by the caller, so we know it's alive + * and kicking, and don't take an extra reference. + */ + if (css == &root->css || css_tryget(css)) { + memcg = mem_cgroup_from_css(css); + break; + } + } + + if (reclaim) { + /* + * The position could have already been updated by a competing + * thread, so check that the value hasn't changed since we read + * it to avoid reclaiming from the same cgroup twice. + */ + (void)cmpxchg(&iter->position, pos, memcg); + + if (pos) + css_put(&pos->css); + + if (!memcg) + iter->generation++; + } + +out_unlock: + rcu_read_unlock(); + if (prev && prev != root) + css_put(&prev->css); + + return memcg; +} + +/** + * mem_cgroup_iter_break - abort a hierarchy walk prematurely + * @root: hierarchy root + * @prev: last visited hierarchy member as returned by mem_cgroup_iter() + */ +void mem_cgroup_iter_break(struct mem_cgroup *root, + struct mem_cgroup *prev) +{ + if (!root) + root = root_mem_cgroup; + if (prev && prev != root) + css_put(&prev->css); +} + +static void __invalidate_reclaim_iterators(struct mem_cgroup *from, + struct mem_cgroup *dead_memcg) +{ + struct mem_cgroup_reclaim_iter *iter; + struct mem_cgroup_per_node *mz; + int nid; + + for_each_node(nid) { + mz = from->nodeinfo[nid]; + iter = &mz->iter; + cmpxchg(&iter->position, dead_memcg, NULL); + } +} + +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +{ + struct mem_cgroup *memcg = dead_memcg; + struct mem_cgroup *last; + + do { + 
__invalidate_reclaim_iterators(memcg, dead_memcg); + last = memcg; + } while ((memcg = parent_mem_cgroup(memcg))); + + /* + * When cgroup1 non-hierarchy mode is used, + * parent_mem_cgroup() does not walk all the way up to the + * cgroup root (root_mem_cgroup). So we have to handle + * dead_memcg from cgroup root separately. + */ + if (last != root_mem_cgroup) + __invalidate_reclaim_iterators(root_mem_cgroup, + dead_memcg); +} + +/** + * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy + * @memcg: hierarchy root + * @fn: function to call for each task + * @arg: argument passed to @fn + * + * This function iterates over tasks attached to @memcg or to any of its + * descendants and calls @fn for each task. If @fn returns a non-zero + * value, the function breaks the iteration loop and returns the value. + * Otherwise, it will iterate over all tasks and return 0. + * + * This function must not be called for the root memory cgroup. + */ +int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, + int (*fn)(struct task_struct *, void *), void *arg) +{ + struct mem_cgroup *iter; + int ret = 0; + + BUG_ON(memcg == root_mem_cgroup); + + for_each_mem_cgroup_tree(iter, memcg) { + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); + while (!ret && (task = css_task_iter_next(&it))) + ret = fn(task, arg); + css_task_iter_end(&it); + if (ret) { + mem_cgroup_iter_break(memcg, iter); + break; + } + } + return ret; +} + +#ifdef CONFIG_DEBUG_VM +void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return; + + memcg = folio_memcg(folio); + + if (!memcg) + VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio); + else + VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); +} +#endif + +/** + * folio_lruvec_lock - Lock the lruvec for a folio. + * @folio: Pointer to the folio. 
+ *
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held.
+ */
+struct lruvec *folio_lruvec_lock(struct folio *folio)
+{
+	struct lruvec *lruvec = folio_lruvec(folio);
+
+	spin_lock(&lruvec->lru_lock);
+	lruvec_memcg_debug(lruvec, folio); /* debug check: lruvec must match the folio's memcg */
+
+	return lruvec;
+}
+
+/**
+ * folio_lruvec_lock_irq - Lock the lruvec for a folio, disabling interrupts.
+ * @folio: Pointer to the folio.
+ *
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held and interrupts
+ * disabled.
+ */
+struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
+{
+	struct lruvec *lruvec = folio_lruvec(folio);
+
+	spin_lock_irq(&lruvec->lru_lock);
+	lruvec_memcg_debug(lruvec, folio); /* debug check: lruvec must match the folio's memcg */
+
+	return lruvec;
+}
+
+/**
+ * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
+ * @flags: Pointer to irqsave flags.
+ *
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held and interrupts
+ * disabled.
+ */
+struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
+		unsigned long *flags)
+{
+	struct lruvec *lruvec = folio_lruvec(folio);
+
+	spin_lock_irqsave(&lruvec->lru_lock, *flags); /* saved IRQ state goes to *flags */
+	lruvec_memcg_debug(lruvec, folio); /* debug check: lruvec must match the folio's memcg */
+
+	return lruvec;
+}
+
+/**
+ * mem_cgroup_update_lru_size - account for adding or removing an lru page
+ * @lruvec: mem_cgroup per zone lru vector
+ * @lru: index of lru list the page is sitting on
+ * @zid: zone id of the accounted pages
+ * @nr_pages: positive when adding or negative when removing
+ *
+ * This function must be called under lru_lock, just before a page is added
+ * to or just after a page is removed from an lru list.
+ *
+ * A removal is applied before the sanity check and an addition after it,
+ * so the check always sees the smaller of the old and new sizes. A
+ * negative size is warned about once and the counter is reset to 0.
+ */
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+				int zid, int nr_pages)
+{
+	struct mem_cgroup_per_node *mz;
+	unsigned long *lru_size;
+	long size;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	lru_size = &mz->lru_zone_size[zid][lru];
+
+	if (nr_pages < 0)
+		*lru_size += nr_pages; /* removal: subtract before checking */
+
+	size = *lru_size; /* signed copy so an underflow shows up as negative */
+	if (WARN_ONCE(size < 0,
+		"%s(%p, %d, %d): lru_size %ld\n",
+		__func__, lruvec, lru, nr_pages, size)) {
+		VM_BUG_ON(1);
+		*lru_size = 0;
+	}
+
+	if (nr_pages > 0)
+		*lru_size += nr_pages; /* addition: apply after checking */
+}
+
+/**
+ * mem_cgroup_margin - calculate chargeable space of a memory cgroup
+ * @memcg: the memory cgroup
+ *
+ * Returns the maximum amount of memory @mem can be charged with, in
+ * pages.
+ */
+static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
+{
+	unsigned long margin = 0;
+	unsigned long count;
+	unsigned long limit;
+
+	count = page_counter_read(&memcg->memory);
+	limit = READ_ONCE(memcg->memory.max);
+	if (count < limit)
+		margin = limit - count;
+
+	if (do_memsw_account()) {
+		/* with memsw accounting the smaller of the two headrooms wins */
+		count = page_counter_read(&memcg->memsw);
+		limit = READ_ONCE(memcg->memsw.max);
+		if (count < limit)
+			margin = min(margin, limit - count);
+		else
+			margin = 0;
+	}
+
+	return margin;
+}
+
+/*
+ * A routine for checking whether "mem" is under move_account() or not.
+ *
+ * Checks whether a cgroup is mc.from or mc.to or under the hierarchy of
+ * the moving cgroups. This is for waiting at high-memory pressure
+ * caused by "move".
+ */
+static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *from;
+	struct mem_cgroup *to;
+	bool ret = false;
+	/*
+	 * Unlike task_move routines, we access mc.to, mc.from not under
+	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
+	 */
+	spin_lock(&mc.lock);
+	from = mc.from;
+	to = mc.to;
+	if (!from)
+		goto unlock;
+
+	ret = mem_cgroup_is_descendant(from, memcg) ||
+		mem_cgroup_is_descendant(to, memcg);
+unlock:
+	spin_unlock(&mc.lock);
+	return ret;
+}
+/*
+ * If another task is moving charges and @memcg is under that move, wait
+ * on mc.waitq (TASK_INTERRUPTIBLE) unless the move has already finished.
+ * Returns true when @memcg was found under move, whether or not the
+ * caller actually slept; false otherwise.
+ */
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
+{
+	if (mc.moving_task && current != mc.moving_task) {
+		if (mem_cgroup_under_move(memcg)) {
+			DEFINE_WAIT(wait);
+			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+			/* moving charge context might have finished. */
+			if (mc.moving_task)
+				schedule();
+			finish_wait(&mc.waitq, &wait);
+			return true;
+		}
+	}
+	return false;
+}
+/*
+ * One entry of memory_stats[]: the name printed by memory_stat_format()
+ * and the stat item index it is read from.
+ */
+struct memory_stat {
+	const char *name;
+	unsigned int idx;
+};
+
+static const struct memory_stat memory_stats[] = {
+	{ "anon",			NR_ANON_MAPPED			},
+	{ "file",			NR_FILE_PAGES			},
+	{ "kernel",			MEMCG_KMEM			},
+	{ "kernel_stack",		NR_KERNEL_STACK_KB		},
+	{ "pagetables",			NR_PAGETABLE			},
+	{ "sec_pagetables",		NR_SECONDARY_PAGETABLE		},
+	{ "percpu",			MEMCG_PERCPU_B			},
+	{ "sock",			MEMCG_SOCK			},
+	{ "vmalloc",			MEMCG_VMALLOC			},
+	{ "shmem",			NR_SHMEM			},
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+	{ "zswap",			MEMCG_ZSWAP_B			},
+	{ "zswapped",			MEMCG_ZSWAPPED			},
+#endif
+	{ "file_mapped",		NR_FILE_MAPPED			},
+	{ "file_dirty",			NR_FILE_DIRTY			},
+	{ "file_writeback",		NR_WRITEBACK			},
+#ifdef CONFIG_SWAP
+	{ "swapcached",			NR_SWAPCACHE			},
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	{ "anon_thp",			NR_ANON_THPS			},
+	{ "file_thp",			NR_FILE_THPS			},
+	{ "shmem_thp",			NR_SHMEM_THPS			},
+#endif
+	{ "inactive_anon",		NR_INACTIVE_ANON		},
+	{ "active_anon",		NR_ACTIVE_ANON			},
+	{ "inactive_file",		NR_INACTIVE_FILE		},
+	{ "active_file",		NR_ACTIVE_FILE			},
+	{ "unevictable",		NR_UNEVICTABLE			},
+	{ "slab_reclaimable",		NR_SLAB_RECLAIMABLE_B		},
+	{ "slab_unreclaimable",		NR_SLAB_UNRECLAIMABLE_B		},
+
+	/* The memory events */
+	{ "workingset_refault_anon",	WORKINGSET_REFAULT_ANON		},
+	{ "workingset_refault_file",	WORKINGSET_REFAULT_FILE		},
+	{ "workingset_activate_anon",	WORKINGSET_ACTIVATE_ANON	},
+	{ "workingset_activate_file",	WORKINGSET_ACTIVATE_FILE	},
+	{ "workingset_restore_anon",	WORKINGSET_RESTORE_ANON		},
+	{ "workingset_restore_file",	WORKINGSET_RESTORE_FILE		},
+	{ "workingset_nodereclaim",	WORKINGSET_NODERECLAIM		},
+};
+
+/*
+ * Translate stat items to the correct unit for memory.stat output:
+ * 1 for the listed byte-sized (*_B) and workingset items, SZ_1K for
+ * NR_KERNEL_STACK_KB, PAGE_SIZE for everything else.
+ */
+static int memcg_page_state_unit(int item)
+{
+	switch (item) {
+	case MEMCG_PERCPU_B:
+	case MEMCG_ZSWAP_B:
+	case NR_SLAB_RECLAIMABLE_B:
+	case NR_SLAB_UNRECLAIMABLE_B:
+	case WORKINGSET_REFAULT_ANON:
+	case WORKINGSET_REFAULT_FILE:
+	case WORKINGSET_ACTIVATE_ANON:
+	case WORKINGSET_ACTIVATE_FILE:
+	case WORKINGSET_RESTORE_ANON:
+	case WORKINGSET_RESTORE_FILE:
+	case WORKINGSET_NODERECLAIM:
+		return 1;
+	case NR_KERNEL_STACK_KB:
+		return SZ_1K;
+	default:
+		return PAGE_SIZE;
+	}
+}
+/*
+ * Return memcg_page_state() for @item multiplied by the unit that
+ * memcg_page_state_unit() assigns to it.
+ */
+static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
+						    int item)
+{
+	return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
+}
+/*
+ * Format the statistics of @memcg as "name value" lines into @buf, which
+ * is @bufsize bytes long. Every memory_stats[] entry is printed first,
+ * then the pgscan and pgsteal sums, then each memcg_vm_event_stat[] event
+ * except PGPGIN and PGPGOUT. An overflow of @buf is warned about once.
+ */
+static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
+{
+	struct seq_buf s;
+	int i;
+
+	seq_buf_init(&s, buf, bufsize);
+
+	/*
+	 * Provide statistics on the state of the memory subsystem as
+	 * well as cumulative event counters that show past behavior.
+	 *
+	 * This list is ordered following a combination of these gradients:
+	 * 1) generic big picture -> specifics and details
+	 * 2) reflecting userspace activity -> reflecting kernel heuristics
+	 *
+	 * Current memory state:
+	 */
+	mem_cgroup_flush_stats();
+
+	for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+		u64 size;
+
+		size = memcg_page_state_output(memcg, memory_stats[i].idx);
+		seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
+
+		/* after the unreclaimable entry, also emit the combined "slab" total */
+		if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
+			size += memcg_page_state_output(memcg,
+							NR_SLAB_RECLAIMABLE_B);
+			seq_buf_printf(&s, "slab %llu\n", size);
+		}
+	}
+
+	/* Accumulated memory events */
+	seq_buf_printf(&s, "pgscan %lu\n",
+		       memcg_events(memcg, PGSCAN_KSWAPD) +
+		       memcg_events(memcg, PGSCAN_DIRECT));
+	seq_buf_printf(&s, "pgsteal %lu\n",
+		       memcg_events(memcg, PGSTEAL_KSWAPD) +
+		       memcg_events(memcg, PGSTEAL_DIRECT));
+
+	for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
+		if (memcg_vm_event_stat[i] == PGPGIN ||
+		    memcg_vm_event_stat[i] == PGPGOUT)
+			continue;
+
+		seq_buf_printf(&s, "%s %lu\n",
+			       vm_event_name(memcg_vm_event_stat[i]),
+			       memcg_events(memcg, memcg_vm_event_stat[i]));
+	}
+
+	/* The above should easily fit into one page */
+	WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+}
+
+#define K(x) ((x) << (PAGE_SHIFT-10)) /* page count -> kB, as printed below */
+/**
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
+ * memory controller.
+ * @memcg: The memory cgroup that went over limit
+ * @p: Task that is going to be killed
+ *
+ * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
+ * enabled
+ *
+ * A NULL @memcg is reported as a global OOM; a NULL @p omits the task
+ * part.
+ */
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
+{
+	rcu_read_lock();
+
+	if (memcg) {
+		pr_cont(",oom_memcg=");
+		pr_cont_cgroup_path(memcg->css.cgroup);
+	} else
+		pr_cont(",global_oom");
+	if (p) {
+		pr_cont(",task_memcg=");
+		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
+ * memory controller.
+ * @memcg: The memory cgroup that went over limit
+ *
+ * Must be called with oom_lock held; the static output buffer relies
+ * on it.
+ */
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
+{
+	/* Use static buffer, for the caller is holding oom_lock. */
+	static char buf[PAGE_SIZE];
+
+	lockdep_assert_held(&oom_lock);
+
+	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
+		K((u64)page_counter_read(&memcg->memory)),
+		K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
+	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->swap)),
+			K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
+	else {
+		pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->memsw)),
+			K((u64)memcg->memsw.max), memcg->memsw.failcnt);
+		pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+			K((u64)page_counter_read(&memcg->kmem)),
+			K((u64)memcg->kmem.max), memcg->kmem.failcnt);
+	}
+
+	pr_info("Memory cgroup stats for ");
+	pr_cont_cgroup_path(memcg->css.cgroup);
+	pr_cont(":");
+	memory_stat_format(memcg, buf, sizeof(buf));
+	pr_info("%s", buf);
+}
+
+/*
+ * Return the memory (and swap, if
configured) limit for a memcg.
+ */
+unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
+{
+	unsigned long max = READ_ONCE(memcg->memory.max);
+
+	if (do_memsw_account()) {
+		if (mem_cgroup_swappiness(memcg)) {
+			/* Calculate swap excess capacity from memsw limit */
+			unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
+
+			max += min(swap, (unsigned long)total_swap_pages);
+		}
+	} else {
+		if (mem_cgroup_swappiness(memcg))
+			max += min(READ_ONCE(memcg->swap.max),
+				   (unsigned long)total_swap_pages);
+	}
+	return max;
+}
+/*
+ * Return the current value of the memory page counter of @memcg.
+ */
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+	return page_counter_read(&memcg->memory);
+}
+/*
+ * Run the OOM killer for @memcg with oom_lock held. Returns true when
+ * taking the lock was interrupted by a fatal signal, when the margin of
+ * @memcg already covers the 2^@order request, when the current task is
+ * dying, or when out_of_memory() returns true; false otherwise.
+ */
+static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+				     int order)
+{
+	struct oom_control oc = {
+		.zonelist = NULL,
+		.nodemask = NULL,
+		.memcg = memcg,
+		.gfp_mask = gfp_mask,
+		.order = order,
+	};
+	bool ret = true;
+
+	if (mutex_lock_killable(&oom_lock))
+		return true;
+
+	if (mem_cgroup_margin(memcg) >= (1 << order))
+		goto unlock;
+
+	/*
+	 * A few threads which were not waiting at mutex_lock_killable() can
+	 * fail to bail out. Therefore, check again after holding oom_lock.
+	 */
+	ret = task_is_dying() || out_of_memory(&oc);
+
+unlock:
+	mutex_unlock(&oom_lock);
+	return ret;
+}
+/*
+ * Walk the hierarchy below @root_memcg with a reclaim cookie for @pgdat
+ * and call mem_cgroup_shrink_node() on each victim. The walk stops when
+ * @root_memcg has no soft limit excess left or when the loop limits
+ * below are reached. The number of scanned pages is added to
+ * *@total_scanned; the return value is the sum of the
+ * mem_cgroup_shrink_node() results.
+ */
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+				   pg_data_t *pgdat,
+				   gfp_t gfp_mask,
+				   unsigned long *total_scanned)
+{
+	struct mem_cgroup *victim = NULL;
+	int total = 0;
+	int loop = 0;
+	unsigned long excess;
+	unsigned long nr_scanned;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.pgdat = pgdat,
+	};
+
+	excess = soft_limit_excess(root_memcg);
+
+	while (1) {
+		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+		if (!victim) {
+			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might be because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!total)
+					break;
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is neither so large that we
+				 * reclaim too much, nor so small that we keep
+				 * coming back to reclaim from this cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+					break;
+			}
+			continue;
+		}
+		total += mem_cgroup_shrink_node(victim, gfp_mask, false,
+					pgdat, &nr_scanned);
+		*total_scanned += nr_scanned;
+		if (!soft_limit_excess(root_memcg))
+			break;
+	}
+	mem_cgroup_iter_break(root_memcg, victim);
+	return total;
+}
+
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map memcg_oom_lock_dep_map = {
+	.name = "memcg_oom_lock",
+};
+#endif
+
+static DEFINE_SPINLOCK(memcg_oom_lock);
+
+/*
+ * Check OOM-Killer is already running under our hierarchy.
+ * If someone is running, return false.
+ *
+ * On success every memcg in the subtree of @memcg has ->oom_lock set; on
+ * failure the flags set by this call are cleared again.
+ */
+static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter, *failed = NULL;
+
+	spin_lock(&memcg_oom_lock);
+
+	for_each_mem_cgroup_tree(iter, memcg) {
+		if (iter->oom_lock) {
+			/*
+			 * this subtree of our hierarchy is already locked
+			 * so we cannot give a lock.
+			 */
+			failed = iter;
+			mem_cgroup_iter_break(memcg, iter);
+			break;
+		} else
+			iter->oom_lock = true;
+	}
+
+	if (failed) {
+		/*
+		 * OK, we failed to lock the whole subtree so we have
+		 * to clean up what we set up to the failing subtree
+		 */
+		for_each_mem_cgroup_tree(iter, memcg) {
+			if (iter == failed) {
+				mem_cgroup_iter_break(memcg, iter);
+				break;
+			}
+			iter->oom_lock = false;
+		}
+	} else
+		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
+
+	spin_unlock(&memcg_oom_lock);
+
+	return !failed;
+}
+/*
+ * Clear ->oom_lock on every memcg in the subtree of @memcg, under
+ * memcg_oom_lock.
+ */
+static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	spin_lock(&memcg_oom_lock);
+	mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
+	for_each_mem_cgroup_tree(iter, memcg)
+		iter->oom_lock = false;
+	spin_unlock(&memcg_oom_lock);
+}
+/*
+ * Increment ->under_oom on every memcg in the subtree of @memcg, under
+ * memcg_oom_lock.
+ */
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	spin_lock(&memcg_oom_lock);
+	for_each_mem_cgroup_tree(iter, memcg)
+		iter->under_oom++;
+	spin_unlock(&memcg_oom_lock);
+}
+/*
+ * Decrement ->under_oom on every memcg in the subtree of @memcg whose
+ * count is still positive, under memcg_oom_lock.
+ */
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *iter;
+
+	/*
+	 * Be careful about under_oom underflows because a child memcg
+	 * could have been added after mem_cgroup_mark_under_oom.
+	 */
+	spin_lock(&memcg_oom_lock);
+	for_each_mem_cgroup_tree(iter, memcg)
+		if (iter->under_oom > 0)
+			iter->under_oom--;
+	spin_unlock(&memcg_oom_lock);
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+/*
+ * Wait queue entry used on memcg_oom_waitq, paired with the memcg the
+ * waiter is sleeping on.
+ */
+struct oom_wait_info {
+	struct mem_cgroup *memcg;
+	wait_queue_entry_t	wait;
+};
+/*
+ * Wake function for memcg_oom_waitq entries. @arg is the memcg passed to
+ * __wake_up(). The waiter is woken only when mem_cgroup_is_descendant()
+ * holds between the two memcgs in at least one direction; otherwise 0 is
+ * returned and the entry stays queued.
+ */
+static int memcg_oom_wake_function(wait_queue_entry_t *wait,
+	unsigned mode, int sync, void *arg)
+{
+	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
+	struct mem_cgroup *oom_wait_memcg;
+	struct oom_wait_info *oom_wait_info;
+
+	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
+	oom_wait_memcg = oom_wait_info->memcg;
+
+	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
+	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, arg);
+}
+/*
+ * Wake the sleepers on memcg_oom_waitq for @memcg if it is marked
+ * under_oom. A NULL @memcg is ignored.
+ */
+static void memcg_oom_recover(struct mem_cgroup *memcg)
+{
+	/*
+	 * For the following lockless ->under_oom test, the only required
+	 * guarantee is that it must see the state asserted by an OOM when
+	 * this function is called as a result of userland actions
+	 * triggered by the notification of the OOM. This is trivially
+	 * achieved by invoking mem_cgroup_mark_under_oom() before
+	 * triggering notification.
+	 */
+	if (memcg && memcg->under_oom)
+		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
+}
+
+/*
+ * Returns true if successfully killed one or more processes. Though in some
+ * corner cases it can return true even without killing any process.
+ *
+ * Returns false without doing anything for orders above
+ * PAGE_ALLOC_COSTLY_ORDER.
+ */
+static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+{
+	bool locked, ret;
+
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		return false;
+
+	memcg_memory_event(memcg, MEMCG_OOM);
+
+	/*
+	 * We are in the middle of the charge context here, so we
+	 * don't want to block when potentially sitting on a callstack
+	 * that holds all kinds of filesystem and mm locks.
+	 *
+	 * cgroup1 allows disabling the OOM killer and waiting for outside
+	 * handling until the charge can succeed; remember the context and put
+	 * the task to sleep at the end of the page fault when all locks are
+	 * released.
+	 *
+	 * On the other hand, in-kernel OOM killer allows for an async victim
+	 * memory reclaim (oom_reaper) and that means that we are not solely
+	 * relying on the oom victim to make a forward progress and we can
+	 * invoke the oom killer here.
+	 *
+	 * Please note that mem_cgroup_out_of_memory might fail to find a
+	 * victim and then we have to bail out from the charge path.
+	 */
+	if (memcg->oom_kill_disable) {
+		if (current->in_user_fault) {
+			/* reference is dropped in mem_cgroup_oom_synchronize() */
+			css_get(&memcg->css);
+			current->memcg_in_oom = memcg;
+			current->memcg_oom_gfp_mask = mask;
+			current->memcg_oom_order = order;
+		}
+		return false;
+	}
+
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
+	if (locked)
+		mem_cgroup_oom_notify(memcg);
+
+	mem_cgroup_unmark_under_oom(memcg);
+	ret = mem_cgroup_out_of_memory(memcg, mask, order);
+
+	if (locked)
+		mem_cgroup_oom_unlock(memcg);
+
+	return ret;
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
+ *
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
+ *
+ * Memcg supports userspace OOM handling where failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation. Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to complete the OOM handling.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * completed, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(bool handle)
+{
+	struct mem_cgroup *memcg = current->memcg_in_oom;
+	struct oom_wait_info owait;
+	bool locked;
+
+	/* OOM is global, do not handle */
+	if (!memcg)
+		return false;
+
+	if (!handle)
+		goto cleanup;
+
+	/* queue ourselves before marking under_oom so that no wakeup is missed */
+	owait.memcg = memcg;
+	owait.wait.flags = 0;
+	owait.wait.func = memcg_oom_wake_function;
+	owait.wait.private = current;
+	INIT_LIST_HEAD(&owait.wait.entry);
+
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
+	if (locked)
+		mem_cgroup_oom_notify(memcg);
+
+	if (locked && !memcg->oom_kill_disable) {
+		/* we own the OOM lock and killing is allowed: kill, do not sleep */
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
+					 current->memcg_oom_order);
+	} else {
+		/* sleep (TASK_KILLABLE) until woken via memcg_oom_waitq */
+		schedule();
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+	}
+
+	if (locked) {
+		mem_cgroup_oom_unlock(memcg);
+		/*
+		 * There is no guarantee that an OOM-lock contender
+		 * sees the wakeups triggered by the OOM kill
+		 * uncharges.  Wake any sleepers explicitly.
+		 */
+		memcg_oom_recover(memcg);
+	}
+cleanup:
+	current->memcg_in_oom = NULL;
+	css_put(&memcg->css); /* pairs with the css_get() taken when memcg_in_oom was set */
+	return true;
+}
+
+/**
+ * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
+ * @victim: task to be killed by the OOM killer
+ * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
+ *
+ * Returns a pointer to a memory cgroup, which has to be cleaned up
+ * by killing all belonging OOM-killable tasks.
+ *
+ * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
+ */
+struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
+					    struct mem_cgroup *oom_domain)
+{
+	struct mem_cgroup *oom_group = NULL;
+	struct mem_cgroup *memcg;
+
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return NULL;
+
+	if (!oom_domain)
+		oom_domain = root_mem_cgroup;
+
+	rcu_read_lock();
+
+	memcg = mem_cgroup_from_task(victim);
+	if (memcg == root_mem_cgroup)
+		goto out;
+
+	/*
+	 * If the victim task has been asynchronously moved to a different
+	 * memory cgroup, we might end up killing tasks outside oom_domain.
+	 * In this case it's better to ignore memory.oom.group.
+	 */
+	if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
+		goto out;
+
+	/*
+	 * Traverse the memory cgroup hierarchy from the victim task's
+	 * cgroup up to the OOMing cgroup (or root) to find the
+	 * highest-level memory cgroup with oom.group set.
+	 */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		if (memcg->oom_group)
+			oom_group = memcg;
+
+		if (memcg == oom_domain)
+			break;
+	}
+
+	if (oom_group)
+		css_get(&oom_group->css); /* reference handed to the caller */
+out:
+	rcu_read_unlock();
+
+	return oom_group;
+}
+/*
+ * Log that the tasks in @memcg are about to be killed because
+ * memory.oom.group is set.
+ */
+void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
+{
+	pr_info("Tasks in ");
+	pr_cont_cgroup_path(memcg->css.cgroup);
+	pr_cont(" are going to be killed due to memory.oom.group set\n");
+}
+
+/**
+ * folio_memcg_lock - Bind a folio to its memcg.
+ * @folio: The folio.
+ *
+ * This function prevents unlocked LRU folios from being moved to
+ * another cgroup.
+ *
+ * It ensures lifetime of the bound memcg.  The caller is responsible
+ * for the lifetime of the folio.
+ *
+ * Every return path leaves rcu_read_lock() held. The matching unlock
+ * path ends in __folio_memcg_unlock(), which calls rcu_read_unlock().
+ */
+void folio_memcg_lock(struct folio *folio)
+{
+	struct mem_cgroup *memcg;
+	unsigned long flags;
+
+	/*
+	 * The RCU lock is held throughout the transaction.  The fast
+	 * path can get away without acquiring the memcg->move_lock
+	 * because page moving starts with an RCU grace period.
+	 */
+	rcu_read_lock();
+
+	if (mem_cgroup_disabled())
+		return;
+again:
+	memcg = folio_memcg(folio);
+	if (unlikely(!memcg))
+		return;
+
+#ifdef CONFIG_PROVE_LOCKING
+	local_irq_save(flags);
+	might_lock(&memcg->move_lock);
+	local_irq_restore(flags);
+#endif
+
+	if (atomic_read(&memcg->moving_account) <= 0)
+		return; /* fast path: no move in progress, RCU alone suffices */
+
+	spin_lock_irqsave(&memcg->move_lock, flags);
+	if (memcg != folio_memcg(folio)) {
+		/* the folio changed memcg while we waited for the lock: retry */
+		spin_unlock_irqrestore(&memcg->move_lock, flags);
+		goto again;
+	}
+
+	/*
+	 * When charge migration first begins, we can have multiple
+	 * critical sections holding the fast-path RCU lock and one
+	 * holding the slowpath move_lock. Track the task who has the
+	 * move_lock for unlock_page_memcg().
+	 */
+	memcg->move_lock_task = current;
+	memcg->move_lock_flags = flags;
+}
+/*
+ * Page-based wrapper: calls folio_memcg_lock() on the folio of @page.
+ */
+void lock_page_memcg(struct page *page)
+{
+	folio_memcg_lock(page_folio(page));
+}
+/*
+ * Release memcg->move_lock if the current task is recorded as its holder,
+ * restoring the saved IRQ flags, then drop the RCU read lock. @memcg may
+ * be NULL, in which case only rcu_read_unlock() is done.
+ */
+static void __folio_memcg_unlock(struct mem_cgroup *memcg)
+{
+	if (memcg && memcg->move_lock_task == current) {
+		unsigned long flags = memcg->move_lock_flags;
+
+		memcg->move_lock_task = NULL;
+		memcg->move_lock_flags = 0;
+
+		spin_unlock_irqrestore(&memcg->move_lock, flags);
+	}
+
+	rcu_read_unlock();
+}
+
+/**
+ * folio_memcg_unlock - Release the binding between a folio and its memcg.
+ * @folio: The folio.
+ *
+ * This releases the binding created by folio_memcg_lock().  This does
+ * not change the accounting of this folio to its memcg, but it does
+ * permit others to change it.
+ */
+void folio_memcg_unlock(struct folio *folio)
+{
+	__folio_memcg_unlock(folio_memcg(folio));
+}
+
+/* Page variant of folio_memcg_unlock(): unlocks the folio given by page_folio(@page). */
+void unlock_page_memcg(struct page *page)
+{
+	folio_memcg_unlock(page_folio(page));
+}
+
+/*
+ * Per-CPU charge cache ("stock"). Accesses from the local CPU are done
+ * under @stock_lock with interrupts saved/disabled.
+ */
+struct memcg_stock_pcp {
+	local_lock_t stock_lock;
+	struct mem_cgroup *cached; /* this is never the root cgroup */
+	unsigned int nr_pages;	/* pages already charged to @cached, not yet consumed */
+
+#ifdef CONFIG_MEMCG_KMEM
+	struct obj_cgroup *cached_objcg;	/* objcg the byte-level fields below belong to */
+	struct pglist_data *cached_pgdat;	/* node the cached slab stat deltas belong to */
+	unsigned int nr_bytes;			/* pre-charged bytes available to @cached_objcg */
+	int nr_slab_reclaimable_b;		/* pending NR_SLAB_RECLAIMABLE_B delta */
+	int nr_slab_unreclaimable_b;		/* pending NR_SLAB_UNRECLAIMABLE_B delta */
+#endif
+
+	struct work_struct work;	/* scheduled by drain_all_stock() on remote CPUs */
+	unsigned long flags;
+#define FLUSHING_CACHED_CHARGE	0	/* bit in @flags: a drain is pending/running */
+};
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
+	.stock_lock = INIT_LOCAL_LOCK(stock_lock),
+};
+/* Held (via trylock) by drain_all_stock() so only one system-wide drain runs. */
+static DEFINE_MUTEX(percpu_charge_mutex);
+
+#ifdef CONFIG_MEMCG_KMEM
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+				     struct mem_cgroup *root_memcg);
+static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
+
+#else
+/* !CONFIG_MEMCG_KMEM: there is no object stock, so these helpers do nothing. */
+static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+	return NULL;
+}
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+				     struct mem_cgroup *root_memcg)
+{
+	return false;
+}
+static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
+{
+}
+#endif
+
+/**
+ * consume_stock: Try to consume stocked charge on this cpu.
+ * @memcg: memcg to consume from.
+ * @nr_pages: how many pages to charge.
+ *
+ * The charges will only happen if @memcg matches the current cpu's memcg
+ * stock, and at least @nr_pages are available in that stock. Failure to
+ * service an allocation will refill the stock.
+ *
+ * returns true if successful, false otherwise.
+ */
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+	struct memcg_stock_pcp *stock;
+	unsigned long flags;
+	bool ret = false;
+
+	/* Requests larger than one batch are never served from the stock. */
+	if (nr_pages > MEMCG_CHARGE_BATCH)
+		return ret;
+
+	local_lock_irqsave(&memcg_stock.stock_lock, flags);
+
+	stock = this_cpu_ptr(&memcg_stock);
+	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
+		stock->nr_pages -= nr_pages;
+		ret = true;
+	}
+
+	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+
+	return ret;
+}
+
+/*
+ * Returns stocks cached in percpu and reset cached information.
+ *
+ * The cached pages are uncharged from the memory counter (and from memsw
+ * when do_memsw_account() is true) and the cached memcg is forgotten.
+ */
+static void drain_stock(struct memcg_stock_pcp *stock)
+{
+	struct mem_cgroup *old = stock->cached;
+
+	if (!old)
+		return;
+
+	if (stock->nr_pages) {
+		page_counter_uncharge(&old->memory, stock->nr_pages);
+		if (do_memsw_account())
+			page_counter_uncharge(&old->memsw, stock->nr_pages);
+		stock->nr_pages = 0;
+	}
+
+	/* Pairs with the css_get() done when this memcg was cached in __refill_stock(). */
+	css_put(&old->css);
+	stock->cached = NULL;
+}
+
+/*
+ * Drain both the object stock and the page stock of the local CPU and
+ * clear FLUSHING_CACHED_CHARGE. Used as the work function of
+ * memcg_stock_pcp::work and called directly by drain_all_stock() for the
+ * current CPU; @dummy is not used.
+ */
+static void drain_local_stock(struct work_struct *dummy)
+{
+	struct memcg_stock_pcp *stock;
+	struct obj_cgroup *old = NULL;
+	unsigned long flags;
+
+	/*
+	 * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
+	 * drain_stock races is that we always operate on local CPU stock
+	 * here with IRQ disabled
+	 */
+	local_lock_irqsave(&memcg_stock.stock_lock, flags);
+
+	stock = this_cpu_ptr(&memcg_stock);
+	old = drain_obj_stock(stock);
+	drain_stock(stock);
+	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+
+	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+	/* The objcg returned by drain_obj_stock() is put outside of stock_lock. */
+	if (old)
+		obj_cgroup_put(old);
+}
+
+/*
+ * Cache charges(val) to local per_cpu area.
+ * This will be consumed by consume_stock() function, later.
+ */
+static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+	struct memcg_stock_pcp *stock;
+
+	stock = this_cpu_ptr(&memcg_stock);
+	if (stock->cached != memcg) { /* reset if necessary */
+		drain_stock(stock);
+		/* The stock holds a css reference on the memcg it caches. */
+		css_get(&memcg->css);
+		stock->cached = memcg;
+	}
+	stock->nr_pages += nr_pages;
+
+	/* Do not let the stock grow beyond one charge batch. */
+	if (stock->nr_pages > MEMCG_CHARGE_BATCH)
+		drain_stock(stock);
+}
+
+/* Locked wrapper: runs __refill_stock() under memcg_stock.stock_lock. */
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+	unsigned long flags;
+
+	local_lock_irqsave(&memcg_stock.stock_lock, flags);
+	__refill_stock(memcg, nr_pages);
+	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+}
+
+/*
+ * Drains all per-CPU charge caches for given root_memcg resp. subtree
+ * of the hierarchy under it.
+ */
+static void drain_all_stock(struct mem_cgroup *root_memcg)
+{
+	int cpu, curcpu;
+
+	/* If someone's already draining, avoid adding more running workers. */
+	if (!mutex_trylock(&percpu_charge_mutex))
+		return;
+	/*
+	 * Notify other cpus that system-wide "drain" is running
+	 * We do not care about races with the cpu hotplug because cpu down
+	 * as well as workers from this path always operate on the local
+	 * per-cpu data. CPU up doesn't touch memcg_stock at all.
+	 */
+	migrate_disable();
+	curcpu = smp_processor_id();
+	for_each_online_cpu(cpu) {
+		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
+		struct mem_cgroup *memcg;
+		bool flush = false;
+
+		/*
+		 * Peek at the (possibly remote) stock under RCU, without its
+		 * stock_lock, only to decide whether a drain is worthwhile.
+		 */
+		rcu_read_lock();
+		memcg = stock->cached;
+		if (memcg && stock->nr_pages &&
+		    mem_cgroup_is_descendant(memcg, root_memcg))
+			flush = true;
+		else if (obj_stock_flush_required(stock, root_memcg))
+			flush = true;
+		rcu_read_unlock();
+
+		/*
+		 * FLUSHING_CACHED_CHARGE keeps a drain from being queued twice;
+		 * it is cleared again in drain_local_stock(). The current CPU
+		 * is drained synchronously, the others through their work item.
+		 */
+		if (flush &&
+		    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+			if (cpu == curcpu)
+				drain_local_stock(&stock->work);
+			else
+				schedule_work_on(cpu, &stock->work);
+		}
+	}
+	migrate_enable();
+	mutex_unlock(&percpu_charge_mutex);
+}
+
+/*
+ * Drain the page stock of @cpu (see the hotplug note in
+ * drain_local_stock()). Always returns 0.
+ */
+static int memcg_hotplug_cpu_dead(unsigned int cpu)
+{
+	struct memcg_stock_pcp *stock;
+
+	stock = &per_cpu(memcg_stock, cpu);
+	drain_stock(stock);
+
+	return 0;
+}
+
+/*
+ * Walk from @memcg up through its ancestors, stopping before the root
+ * cgroup, and reclaim up to @nr_pages from every level whose usage is above
+ * its memory.high. Each such level also gets a MEMCG_HIGH event.
+ *
+ * Returns the sum of what try_to_free_mem_cgroup_pages() reported.
+ */
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
+				  unsigned int nr_pages,
+				  gfp_t gfp_mask)
+{
+	unsigned long nr_reclaimed = 0;
+
+	do {
+		unsigned long pflags;
+
+		/*
+		 * Not above high at this level: "continue" jumps to the loop
+		 * condition, i.e. moves on to the parent.
+		 */
+		if (page_counter_read(&memcg->memory) <=
+		    READ_ONCE(memcg->memory.high))
+			continue;
+
+		memcg_memory_event(memcg, MEMCG_HIGH);
+
+		psi_memstall_enter(&pflags);
+		nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
+							gfp_mask,
+							MEMCG_RECLAIM_MAY_SWAP);
+		psi_memstall_leave(&pflags);
+	} while ((memcg = parent_mem_cgroup(memcg)) &&
+		 !mem_cgroup_is_root(memcg));
+
+	return nr_reclaimed;
+}
+
+/*
+ * Work function of mem_cgroup::high_work (scheduled from try_charge_memcg()
+ * when the charge happens outside task context): reclaim one charge batch
+ * with GFP_KERNEL.
+ */
+static void high_work_func(struct work_struct *work)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = container_of(work, struct mem_cgroup, high_work);
+	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
+}
+
+/*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * enough to still cause a significant slowdown in most cases, while still
+ * allowing diagnostics and tracing to proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these either side of the exponentiation to
+ * maintain precision and scale to a reasonable number of jiffies (see the table
+ * below).
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ *   overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
+ *   proposed penalty in order to reduce to a reasonable number of jiffies, and
+ *   to produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ *   +-------+------------------------+
+ *   | usage | time to allocate in ms |
+ *   +-------+------------------------+
+ *   | 100M  |                      0 |
+ *   | 101M  |                      6 |
+ *   | 102M  |                     25 |
+ *   | 103M  |                     57 |
+ *   | 104M  |                    102 |
+ *   | 105M  |                    159 |
+ *   | 106M  |                    230 |
+ *   | 107M  |                    313 |
+ *   | 108M  |                    409 |
+ *   | 109M  |                    518 |
+ *   | 110M  |                    639 |
+ *   | 111M  |                    774 |
+ *   | 112M  |                    921 |
+ *   | 113M  |                   1081 |
+ *   | 114M  |                   1254 |
+ *   | 115M  |                   1439 |
+ *   | 116M  |                   1638 |
+ *   | 117M  |                   1849 |
+ *   | 118M  |                   2000 |
+ *   | 119M  |                   2000 |
+ *   | 120M  |                   2000 |
+ *   +-------+------------------------+
+ */
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
+ #define MEMCG_DELAY_SCALING_SHIFT 14
+
+/*
+ * Return how far @usage is above @high, as a fraction of @high in fixed
+ * point with MEMCG_DELAY_PRECISION_SHIFT fractional bits. Returns 0 when
+ * @usage does not exceed @high.
+ */
+static u64 calculate_overage(unsigned long usage, unsigned long high)
+{
+	u64 overage;
+
+	if (usage <= high)
+		return 0;
+
+	/*
+	 * Prevent division by 0 in overage calculation by acting as if
+	 * it was a threshold of 1 page
+	 */
+	high = max(high, 1UL);
+
+	overage = usage - high;
+	overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+	return div64_u64(overage, high);
+}
+
+/*
+ * Largest memory.high overage (see calculate_overage()) among @memcg and
+ * its ancestors, stopping before the root cgroup.
+ */
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
+{
+	u64 overage, max_overage = 0;
+
+	do {
+		overage = calculate_overage(page_counter_read(&memcg->memory),
+					    READ_ONCE(memcg->memory.high));
+		max_overage = max(overage, max_overage);
+	} while ((memcg = parent_mem_cgroup(memcg)) &&
+		 !mem_cgroup_is_root(memcg));
+
+	return max_overage;
+}
+
+/*
+ * Same walk as mem_find_max_overage(), but for the swap counter against
+ * swap.high. Every level that is over its limit additionally gets a
+ * MEMCG_SWAP_HIGH event.
+ */
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
+{
+	u64 overage, max_overage = 0;
+
+	do {
+		overage = calculate_overage(page_counter_read(&memcg->swap),
+					    READ_ONCE(memcg->swap.high));
+		if (overage)
+			memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
+		max_overage = max(overage, max_overage);
+	} while ((memcg = parent_mem_cgroup(memcg)) &&
+		 !mem_cgroup_is_root(memcg));
+
+	return max_overage;
+}
+
+/*
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
+ */
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+					  unsigned int nr_pages,
+					  u64 max_overage)
+{
+	unsigned long penalty_jiffies;
+
+	if (!max_overage)
+		return 0;
+
+	/*
+	 * We use overage compared to memory.high to calculate the number of
+	 * jiffies to sleep (penalty_jiffies). Ideally this value should be
+	 * fairly lenient on small overages, and increasingly harsh when the
+	 * memcg in question makes it clear that it has no intention of stopping
+	 * its crazy behaviour, so we exponentially increase the delay based on
+	 * overage amount.
+	 */
+	penalty_jiffies = max_overage * max_overage * HZ;
+	penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
+	penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
+
+	/*
+	 * Factor in the task's own contribution to the overage, such that four
+	 * N-sized allocations are throttled approximately the same as one
+	 * 4N-sized allocation.
+	 *
+	 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+	 * larger the current charge batch is than that.
+	 */
+	return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+}
+
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(gfp_t gfp_mask)
+{
+	unsigned long penalty_jiffies;
+	unsigned long pflags;
+	unsigned long nr_reclaimed;
+	unsigned int nr_pages = current->memcg_nr_pages_over_high;
+	int nr_retries = MAX_RECLAIM_RETRIES;
+	struct mem_cgroup *memcg;
+	bool in_retry = false;
+
+	if (likely(!nr_pages))
+		return;
+
+	/* Reference dropped at "out:"; the per-task debt is consumed here. */
+	memcg = get_mem_cgroup_from_mm(current->mm);
+	current->memcg_nr_pages_over_high = 0;
+
+retry_reclaim:
+	/*
+	 * The allocating task should reclaim at least the batch size, but for
+	 * subsequent retries we only want to do what's necessary to prevent oom
+	 * or breaching resource isolation.
+	 *
+	 * This is distinct from memory.max or page allocator behaviour because
+	 * memory.high is currently batched, whereas memory.max and the page
+	 * allocator run every time an allocation is made.
+	 */
+	nr_reclaimed = reclaim_high(memcg,
+				    in_retry ? SWAP_CLUSTER_MAX : nr_pages,
+				    gfp_mask);
+
+	/*
+	 * memory.high is breached and reclaim is unable to keep up. Throttle
+	 * allocators proactively to slow down excessive growth.
+	 */
+	penalty_jiffies = calculate_high_delay(memcg, nr_pages,
+					       mem_find_max_overage(memcg));
+
+	penalty_jiffies += calculate_high_delay(memcg, nr_pages,
+						swap_find_max_overage(memcg));
+
+	/*
+	 * Clamp the max delay per usermode return so as to still keep the
+	 * application moving forwards and also permit diagnostics, albeit
+	 * extremely slowly.
+	 */
+	penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+	/*
+	 * Don't sleep if the amount of jiffies this memcg owes us is so low
+	 * that it's not even worth doing, in an attempt to be nice to those who
+	 * go only a small amount over their memory.high value and maybe haven't
+	 * been aggressively reclaimed enough yet.
+	 */
+	if (penalty_jiffies <= HZ / 100)
+		goto out;
+
+	/*
+	 * If reclaim is making forward progress but we're still over
+	 * memory.high, we want to encourage that rather than doing allocator
+	 * throttling.
+	 */
+	if (nr_reclaimed || nr_retries--) {
+		in_retry = true;
+		goto retry_reclaim;
+	}
+
+	/*
+	 * If we exit early, we're guaranteed to die (since
+	 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+	 * need to account for any ill-begotten jiffies to pay them off later.
+	 */
+	psi_memstall_enter(&pflags);
+	schedule_timeout_killable(penalty_jiffies);
+	psi_memstall_leave(&pflags);
+
+out:
+	css_put(&memcg->css);
+}
+
+/*
+ * Charge @nr_pages to @memcg.
+ *
+ * The per-CPU stock is tried first, then the page counters with a batch of
+ * max(MEMCG_CHARGE_BATCH, @nr_pages) pages, then with exactly @nr_pages.
+ * If that fails and the context allows it, the function reclaims, drains
+ * the per-CPU stocks, retries and finally invokes mem_cgroup_oom().
+ *
+ * Returns 0 when the charge was made, including charges forced over the
+ * limit for PF_MEMALLOC tasks and __GFP_NOFAIL/__GFP_HIGH requests, or
+ * -ENOMEM.
+ */
+static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
+			unsigned int nr_pages)
+{
+	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
+	int nr_retries = MAX_RECLAIM_RETRIES;
+	struct mem_cgroup *mem_over_limit;
+	struct page_counter *counter;
+	unsigned long nr_reclaimed;
+	bool passed_oom = false;
+	unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
+	bool drained = false;
+	bool raised_max_event = false;
+	unsigned long pflags;
+
+retry:
+	if (consume_stock(memcg, nr_pages))
+		return 0;
+
+	/*
+	 * memsw (when accounted) is charged before memory; if the memory
+	 * charge then fails, the memsw charge is rolled back. @counter is
+	 * set by the failing page_counter_try_charge() and identifies the
+	 * cgroup that is over its limit.
+	 */
+	if (!do_memsw_account() ||
+	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
+		if (page_counter_try_charge(&memcg->memory, batch, &counter))
+			goto done_restock;
+		if (do_memsw_account())
+			page_counter_uncharge(&memcg->memsw, batch);
+		mem_over_limit = mem_cgroup_from_counter(counter, memory);
+	} else {
+		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
+		reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
+	}
+
+	/* The batched charge failed: retry with just what was asked for. */
+	if (batch > nr_pages) {
+		batch = nr_pages;
+		goto retry;
+	}
+
+	/*
+	 * Prevent unbounded recursion when reclaim operations need to
+	 * allocate memory. This might exceed the limits temporarily,
+	 * but we prefer facilitating memory reclaim and getting back
+	 * under the limit over triggering OOM kills in these cases.
+	 */
+	if (unlikely(current->flags & PF_MEMALLOC))
+		goto force;
+
+	if (unlikely(task_in_memcg_oom(current)))
+		goto nomem;
+
+	if (!gfpflags_allow_blocking(gfp_mask))
+		goto nomem;
+
+	memcg_memory_event(mem_over_limit, MEMCG_MAX);
+	raised_max_event = true;
+
+	psi_memstall_enter(&pflags);
+	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+						    gfp_mask, reclaim_options);
+	psi_memstall_leave(&pflags);
+
+	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
+		goto retry;
+
+	/* Drain the per-CPU stocks once per call before going any further. */
+	if (!drained) {
+		drain_all_stock(mem_over_limit);
+		drained = true;
+		goto retry;
+	}
+
+	if (gfp_mask & __GFP_NORETRY)
+		goto nomem;
+	/*
+	 * Even though the limit is exceeded at this point, reclaim
+	 * may have been able to free some pages.  Retry the charge
+	 * before killing the task.
+	 *
+	 * Only for regular pages, though: huge pages are rather
+	 * unlikely to succeed so close to the limit, and we fall back
+	 * to regular pages anyway in case of failure.
+	 */
+	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
+		goto retry;
+	/*
+	 * At task move, charge accounts can be doubly counted. So, it's
+	 * better to wait until the end of task_move if something is going on.
+	 */
+	if (mem_cgroup_wait_acct_move(mem_over_limit))
+		goto retry;
+
+	if (nr_retries--)
+		goto retry;
+
+	if (gfp_mask & __GFP_RETRY_MAYFAIL)
+		goto nomem;
+
+	/* Avoid endless loop for tasks bypassed by the oom killer */
+	if (passed_oom && task_is_dying())
+		goto nomem;
+
+	/*
+	 * keep retrying as long as the memcg oom killer is able to make
+	 * a forward progress or bypass the charge if the oom killer
+	 * couldn't make any progress.
+	 */
+	if (mem_cgroup_oom(mem_over_limit, gfp_mask,
+			   get_order(nr_pages * PAGE_SIZE))) {
+		passed_oom = true;
+		nr_retries = MAX_RECLAIM_RETRIES;
+		goto retry;
+	}
+nomem:
+	/*
+	 * Memcg doesn't have a dedicated reserve for atomic
+	 * allocations. But like the global atomic pool, we need to
+	 * put the burden of reclaim on regular allocation requests
+	 * and let these go through as privileged allocations.
+	 */
+	if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
+		return -ENOMEM;
+force:
+	/*
+	 * If the allocation has to be enforced, don't forget to raise
+	 * a MEMCG_MAX event.
+	 */
+	if (!raised_max_event)
+		memcg_memory_event(mem_over_limit, MEMCG_MAX);
+
+	/*
+	 * The allocation either can't fail or will lead to more memory
+	 * being freed very soon.  Allow memory usage to go over the limit
+	 * temporarily by force charging it.
+	 */
+	page_counter_charge(&memcg->memory, nr_pages);
+	if (do_memsw_account())
+		page_counter_charge(&memcg->memsw, nr_pages);
+
+	return 0;
+
+done_restock:
+	/* Whatever was charged beyond @nr_pages goes into the per-CPU stock. */
+	if (batch > nr_pages)
+		refill_stock(memcg, batch - nr_pages);
+
+	/*
+	 * If the hierarchy is above the normal consumption range, schedule
+	 * reclaim on returning to userland.  We can perform reclaim here
+	 * if __GFP_RECLAIM but let's always punt for simplicity and so that
+	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
+	 * not recorded as it most likely matches current's and won't
+	 * change in the meantime.  As high limit is checked again before
+	 * reclaim, the cost of mismatch is negligible.
+	 */
+	do {
+		bool mem_high, swap_high;
+
+		mem_high = page_counter_read(&memcg->memory) >
+			READ_ONCE(memcg->memory.high);
+		swap_high = page_counter_read(&memcg->swap) >
+			READ_ONCE(memcg->swap.high);
+
+		/* Don't bother a random interrupted task */
+		if (!in_task()) {
+			if (mem_high) {
+				schedule_work(&memcg->high_work);
+				break;
+			}
+			continue;
+		}
+
+		if (mem_high || swap_high) {
+			/*
+			 * The allocating tasks in this cgroup will need to do
+			 * reclaim or be throttled to prevent further growth
+			 * of the memory or swap footprints.
+			 *
+			 * Target some best-effort fairness between the tasks,
+			 * and distribute reclaim work and delay penalties
+			 * based on how much each task is actually allocating.
+			 */
+			current->memcg_nr_pages_over_high += batch;
+			set_notify_resume(current);
+			break;
+		}
+	} while ((memcg = parent_mem_cgroup(memcg)));
+
+	/*
+	 * Once the task's accumulated debt exceeds one batch, handle it
+	 * right away if this context may block and is not itself reclaim.
+	 */
+	if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
+	    !(current->flags & PF_MEMALLOC) &&
+	    gfpflags_allow_blocking(gfp_mask)) {
+		mem_cgroup_handle_over_high(gfp_mask);
+	}
+	return 0;
+}
+
+/* try_charge_memcg() wrapper: charging the root cgroup always succeeds and is a no-op. */
+static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+			     unsigned int nr_pages)
+{
+	if (mem_cgroup_is_root(memcg))
+		return 0;
+
+	return try_charge_memcg(memcg, gfp_mask, nr_pages);
+}
+
+/*
+ * Give back @nr_pages to the memory counter (and to memsw when it is
+ * accounted). Nothing is done for the root cgroup, mirroring try_charge().
+ */
+static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+	if (mem_cgroup_is_root(memcg))
+		return;
+
+	page_counter_uncharge(&memcg->memory, nr_pages);
+	if (do_memsw_account())
+		page_counter_uncharge(&memcg->memsw, nr_pages);
+}
+
+/* Record @memcg as the owner of @folio, which must not have a memcg yet. */
+static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+{
+	VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
+	/*
+	 * Any of the following ensures page's memcg stability:
+	 *
+	 * - the page lock
+	 * - LRU isolation
+	 * - lock_page_memcg()
+	 * - exclusive reference
+	 * - mem_cgroup_trylock_pages()
+	 */
+	folio->memcg_data = (unsigned long)memcg;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * The allocated objcg pointers array is not accounted directly.
+ * Moreover, it should not come from DMA buffer and is not readily
+ * reclaimable. So those GFP bits should be masked off.
+ */
+#define OBJCGS_CLEAR_MASK	(__GFP_DMA | __GFP_RECLAIMABLE | \
+				 __GFP_ACCOUNT | __GFP_NOFAIL)
+
+/*
+ * mod_objcg_mlstate() may be called with irq enabled, so
+ * mod_memcg_lruvec_state() should be used.
+ */
+static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
+				     struct pglist_data *pgdat,
+				     enum node_stat_item idx, int nr)
+{
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
+
+	/* The objcg -> memcg lookup and the stat update are done under RCU. */
+	rcu_read_lock();
+	memcg = obj_cgroup_memcg(objcg);
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	mod_memcg_lruvec_state(lruvec, idx, nr);
+	rcu_read_unlock();
+}
+
+/*
+ * Allocate the zeroed array of obj_cgroup pointers (one per object of
+ * @slab) and publish it in slab->memcg_data, tagged with MEMCG_DATA_OBJCGS.
+ *
+ * Returns -ENOMEM if the array cannot be allocated, 0 otherwise, including
+ * the case where another context installed its array first and ours is
+ * freed again.
+ */
+int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
+			     gfp_t gfp, bool new_slab)
+{
+	unsigned int objects = objs_per_slab(s, slab);
+	unsigned long memcg_data;
+	void *vec;
+
+	gfp &= ~OBJCGS_CLEAR_MASK;
+	vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
+			   slab_nid(slab));
+	if (!vec)
+		return -ENOMEM;
+
+	memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
+	if (new_slab) {
+		/*
+		 * If the slab is brand new and nobody can yet access its
+		 * memcg_data, no synchronization is required and memcg_data can
+		 * be simply assigned.
+		 */
+		slab->memcg_data = memcg_data;
+	} else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
+		/*
+		 * If the slab is already in use, somebody can allocate and
+		 * assign obj_cgroups in parallel. In this case the existing
+		 * objcg vector should be reused.
+		 */
+		kfree(vec);
+		return 0;
+	}
+
+	kmemleak_not_leak(vec);
+	return 0;
+}
+
+/*
+ * Return the memcg that object @p, located in @folio, is charged to, or
+ * NULL. Slab folios are resolved per object through the slab's objcg
+ * array; any other folio goes through page_memcg_check().
+ */
+static __always_inline
+struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
+{
+	/*
+	 * Slab objects are accounted individually, not per-page.
+	 * Memcg membership data for each individual object is saved in
+	 * slab->memcg_data.
+	 */
+	if (folio_test_slab(folio)) {
+		struct obj_cgroup **objcgs;
+		struct slab *slab;
+		unsigned int off;
+
+		slab = folio_slab(folio);
+		objcgs = slab_objcgs(slab);
+		if (!objcgs)
+			return NULL;
+
+		off = obj_to_index(slab->slab_cache, slab, p);
+		if (objcgs[off])
+			return obj_cgroup_memcg(objcgs[off]);
+
+		return NULL;
+	}
+
+	/*
+	 * page_memcg_check() is used here, because in theory we can encounter
+	 * a folio where the slab flag has been cleared already, but
+	 * slab->memcg_data has not been freed yet.
+	 * page_memcg_check(page) will guarantee that a proper memory
+	 * cgroup pointer or NULL will be returned.
+	 */
+	return page_memcg_check(folio_page(folio, 0));
+}
+
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ *
+ * A passed kernel object can be a slab object, vmalloc object or a generic
+ * kernel page, so different mechanisms for getting the memory cgroup pointer
+ * should be used.
+ *
+ * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
+ * can not know for sure how the kernel object is implemented.
+ * mem_cgroup_from_obj() can be safely used in such cases.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
+{
+	struct folio *folio;
+
+	if (mem_cgroup_disabled())
+		return NULL;
+
+	/* vmalloc addresses need vmalloc_to_page(); everything else virt_to_folio(). */
+	if (unlikely(is_vmalloc_addr(p)))
+		folio = page_folio(vmalloc_to_page(p));
+	else
+		folio = virt_to_folio(p);
+
+	return mem_cgroup_from_obj_folio(folio, p);
+}
+
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects,
+ * allocated using vmalloc().
+ *
+ * A passed kernel object must be a slab object or a generic kernel page.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+{
+	if (mem_cgroup_disabled())
+		return NULL;
+
+	return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
+}
+
+/*
+ * Starting at @memcg and walking towards the root, return the first objcg
+ * on which obj_cgroup_tryget() succeeds, i.e. with a reference held.
+ * Returns NULL if root_mem_cgroup is reached first. memcg->objcg is read
+ * with rcu_dereference(); both callers below hold rcu_read_lock().
+ */
+static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
+{
+	struct obj_cgroup *objcg = NULL;
+
+	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+		objcg = rcu_dereference(memcg->objcg);
+		if (objcg && obj_cgroup_tryget(objcg))
+			break;
+		objcg = NULL;
+	}
+	return objcg;
+}
+
+/*
+ * Return a referenced objcg for the current context: the one of
+ * active_memcg() if that is set, otherwise the one of current's memcg.
+ * Returns NULL when memcg_kmem_bypass() is true or no objcg is found.
+ */
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+	struct obj_cgroup *objcg = NULL;
+	struct mem_cgroup *memcg;
+
+	if (memcg_kmem_bypass())
+		return NULL;
+
+	rcu_read_lock();
+	if (unlikely(active_memcg()))
+		memcg = active_memcg();
+	else
+		memcg = mem_cgroup_from_task(current);
+	objcg = __get_obj_cgroup_from_memcg(memcg);
+	rcu_read_unlock();
+	return objcg;
+}
+
+/*
+ * Return a referenced objcg for @page, or NULL. Kmem pages already carry
+ * an objcg; for other pages it is looked up from the folio's memcg.
+ */
+struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
+{
+	struct obj_cgroup *objcg;
+
+	if (!memcg_kmem_enabled())
+		return NULL;
+
+	if (PageMemcgKmem(page)) {
+		objcg = __folio_objcg(page_folio(page));
+		obj_cgroup_get(objcg);
+	} else {
+		struct mem_cgroup *memcg;
+
+		rcu_read_lock();
+		memcg = __folio_memcg(page_folio(page));
+		if (memcg)
+			objcg = __get_obj_cgroup_from_memcg(memcg);
+		else
+			objcg = NULL;
+		rcu_read_unlock();
+	}
+	return objcg;
+}
+
+/*
+ * Adjust the MEMCG_KMEM statistic of @memcg by @nr_pages (may be negative).
+ * When the memory controller is not on the default hierarchy, the separate
+ * kmem page counter is charged or uncharged as well.
+ */
+static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
+{
+	mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+		if (nr_pages > 0)
+			page_counter_charge(&memcg->kmem, nr_pages);
+		else
+			page_counter_uncharge(&memcg->kmem, -nr_pages);
+	}
+}
+
+
+/*
+ * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
+ * @objcg: object cgroup to uncharge
+ * @nr_pages: number of pages to uncharge
+ *
+ * The pages are not uncharged from the page counters directly; they are
+ * handed to the per-CPU stock through refill_stock().
+ */
+static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
+				      unsigned int nr_pages)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = get_mem_cgroup_from_objcg(objcg);
+
+	memcg_account_kmem(memcg, -nr_pages);
+	refill_stock(memcg, nr_pages);
+
+	css_put(&memcg->css);
+}
+
+/*
+ * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg
+ * @objcg: object cgroup to charge
+ * @gfp: reclaim mode
+ * @nr_pages: number of pages to charge
+ *
+ * Returns 0 on success, an error code on failure.
+ */
+static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
+				   unsigned int nr_pages)
+{
+	struct mem_cgroup *memcg;
+	int ret;
+
+	memcg = get_mem_cgroup_from_objcg(objcg);
+
+	ret = try_charge_memcg(memcg, gfp, nr_pages);
+	if (ret)
+		goto out;
+
+	memcg_account_kmem(memcg, nr_pages);
+out:
+	css_put(&memcg->css);
+
+	return ret;
+}
+
+/**
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
+ * @page: page to charge
+ * @gfp: reclaim mode
+ * @order: allocation order
+ *
+ * Returns 0 on success, an error code on failure.
+ */
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
+{
+	struct obj_cgroup *objcg;
+	int ret = 0;
+
+	objcg = get_obj_cgroup_from_current();
+	if (objcg) {
+		ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
+		if (!ret) {
+			/* On success the objcg reference is kept by the page. */
+			page->memcg_data = (unsigned long)objcg |
+				MEMCG_DATA_KMEM;
+			return 0;
+		}
+		obj_cgroup_put(objcg);
+	}
+	return ret;
+}
+
+/**
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
+ * @page: page to uncharge
+ * @order: allocation order
+ */
+void __memcg_kmem_uncharge_page(struct page *page, int order)
+{
+	struct folio *folio = page_folio(page);
+	struct obj_cgroup *objcg;
+	unsigned int nr_pages = 1 << order;
+
+	if (!folio_memcg_kmem(folio))
+		return;
+
+	objcg = __folio_objcg(folio);
+	obj_cgroup_uncharge_pages(objcg, nr_pages);
+	folio->memcg_data = 0;
+	/* Drop the reference the page held on its objcg. */
+	obj_cgroup_put(objcg);
+}
+
+/*
+ * Account @nr bytes of slab statistic @idx on node @pgdat to @objcg.
+ * Updates are batched in the per-CPU stock and only pushed out through
+ * mod_objcg_mlstate() under the conditions described in the body.
+ */
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+		     enum node_stat_item idx, int nr)
+{
+	struct memcg_stock_pcp *stock;
+	struct obj_cgroup *old = NULL;
+	unsigned long flags;
+	int *bytes;
+
+	local_lock_irqsave(&memcg_stock.stock_lock, flags);
+	stock = this_cpu_ptr(&memcg_stock);
+
+	/*
+	 * Save vmstat data in stock and skip vmstat array update unless
+	 * accumulating over a page of vmstat data or when pgdat or idx
+	 * changes.
+	 */
+	if (READ_ONCE(stock->cached_objcg) != objcg) {
+		/* Different objcg: flush the old one and take over the stock. */
+		old = drain_obj_stock(stock);
+		obj_cgroup_get(objcg);
+		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+				? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+		WRITE_ONCE(stock->cached_objcg, objcg);
+		stock->cached_pgdat = pgdat;
+	} else if (stock->cached_pgdat != pgdat) {
+		/* Flush the existing cached vmstat data */
+		struct pglist_data *oldpg = stock->cached_pgdat;
+
+		if (stock->nr_slab_reclaimable_b) {
+			mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
+					  stock->nr_slab_reclaimable_b);
+			stock->nr_slab_reclaimable_b = 0;
+		}
+		if (stock->nr_slab_unreclaimable_b) {
+			mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
+					  stock->nr_slab_unreclaimable_b);
+			stock->nr_slab_unreclaimable_b = 0;
+		}
+		stock->cached_pgdat = pgdat;
+	}
+
+	bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
+					       : &stock->nr_slab_unreclaimable_b;
+	/*
+	 * Even for large object >= PAGE_SIZE, the vmstat data will still be
+	 * cached locally at least once before pushing it out.
+	 */
+	if (!*bytes) {
+		*bytes = nr;
+		nr = 0;
+	} else {
+		*bytes += nr;
+		if (abs(*bytes) > PAGE_SIZE) {
+			nr = *bytes;
+			*bytes = 0;
+		} else {
+			nr = 0;
+		}
+	}
+	/* A non-zero @nr at this point is the accumulated amount to push out. */
+	if (nr)
+		mod_objcg_mlstate(objcg, pgdat, idx, nr);
+
+	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+	if (old)
+		obj_cgroup_put(old);
+}
+
+/*
+ * Try to take @nr_bytes from the local CPU's object stock. Succeeds only
+ * if the stock currently caches @objcg and holds at least @nr_bytes.
+ */
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+	struct memcg_stock_pcp *stock;
+	unsigned long flags;
+	bool ret = false;
+
+	local_lock_irqsave(&memcg_stock.stock_lock, flags);
+
+	stock = this_cpu_ptr(&memcg_stock);
+	if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
+		stock->nr_bytes -= nr_bytes;
+		ret = true;
+	}
+
+	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+
+	return ret;
+}
+
+/*
+ * Flush the object part of @stock: whole pages of cached bytes go to the
+ * page stock, the sub-page remainder goes to objcg->nr_charged_bytes, and
+ * pending slab stat deltas are applied. Returns the previously cached
+ * objcg (or NULL); the caller must put it as described at the end.
+ */
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+	struct obj_cgroup *old = READ_ONCE(stock->cached_objcg);
+
+	if (!old)
+		return NULL;
+
+	if (stock->nr_bytes) {
+		unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+		unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
+
+		if (nr_pages) {
+			struct mem_cgroup *memcg;
+
+			memcg = get_mem_cgroup_from_objcg(old);
+
+			memcg_account_kmem(memcg, -nr_pages);
+			__refill_stock(memcg, nr_pages);
+
+			css_put(&memcg->css);
+		}
+
+		/*
+		 * The leftover is flushed to the centralized per-memcg value.
+		 * On the next attempt to refill obj stock it will be moved
+		 * to a per-cpu stock (probably, on another CPU), see
+		 * refill_obj_stock().
+		 *
+		 * How often it's flushed is a trade-off between the memory
+		 * limit enforcement accuracy and potential CPU contention,
+		 * so it might be changed in the future.
+		 */
+		atomic_add(nr_bytes, &old->nr_charged_bytes);
+		stock->nr_bytes = 0;
+	}
+
+	/*
+	 * Flush the vmstat data in current stock
+	 */
+	if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
+		if (stock->nr_slab_reclaimable_b) {
+			mod_objcg_mlstate(old, stock->cached_pgdat,
+					  NR_SLAB_RECLAIMABLE_B,
+					  stock->nr_slab_reclaimable_b);
+			stock->nr_slab_reclaimable_b = 0;
+		}
+		if (stock->nr_slab_unreclaimable_b) {
+			mod_objcg_mlstate(old, stock->cached_pgdat,
+					  NR_SLAB_UNRECLAIMABLE_B,
+					  stock->nr_slab_unreclaimable_b);
+			stock->nr_slab_unreclaimable_b = 0;
+		}
+		stock->cached_pgdat = NULL;
+	}
+
+	WRITE_ONCE(stock->cached_objcg, NULL);
+	/*
+	 * The `old' object needs to be released by the caller via
+	 * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
+	 */
+	return old;
+}
+
+/*
+ * Tell drain_all_stock() whether @stock caches an objcg whose memcg is
+ * @root_memcg or one of its descendants.
+ */
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+				     struct mem_cgroup *root_memcg)
+{
+	struct obj_cgroup *objcg = READ_ONCE(stock->cached_objcg);
+	struct mem_cgroup *memcg;
+
+	if (objcg) {
+		memcg = obj_cgroup_memcg(objcg);
+		if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Add @nr_bytes of already charged memory to the local object stock of
+ * @objcg. With @allow_uncharge (forced to true when the cached objcg
+ * changes) any whole pages accumulated beyond PAGE_SIZE are uncharged
+ * after the stock lock has been dropped.
+ */
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
+			     bool allow_uncharge)
+{
+	struct memcg_stock_pcp *stock;
+	struct obj_cgroup *old = NULL;
+	unsigned long flags;
+	unsigned int nr_pages = 0;
+
+	local_lock_irqsave(&memcg_stock.stock_lock, flags);
+
+	stock = this_cpu_ptr(&memcg_stock);
+	if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
+		old = drain_obj_stock(stock);
+		obj_cgroup_get(objcg);
+		WRITE_ONCE(stock->cached_objcg, objcg);
+		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+				? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+		allow_uncharge = true;	/* Allow uncharge when objcg changes */
+	}
+	stock->nr_bytes += nr_bytes;
+
+	if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
+		nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+		stock->nr_bytes &= (PAGE_SIZE - 1);
+	}
+
+	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+	if (old)
+		obj_cgroup_put(old);
+
+	if (nr_pages)
+		obj_cgroup_uncharge_pages(objcg, nr_pages);
+}
+
+/*
+ * Charge @size bytes to @objcg: from the per-CPU object stock if possible,
+ * otherwise by charging @size rounded up to whole pages and stocking the
+ * unused remainder. Returns 0 on success or the error from
+ * obj_cgroup_charge_pages().
+ */
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+	unsigned int nr_pages, nr_bytes;
+	int ret;
+
+	if (consume_obj_stock(objcg, size))
+		return 0;
+
+	/*
+	 * In theory, objcg->nr_charged_bytes can have enough
+	 * pre-charged bytes to satisfy the allocation. However,
+	 * flushing objcg->nr_charged_bytes requires two atomic
+	 * operations, and objcg->nr_charged_bytes can't be big.
+	 * The shared objcg->nr_charged_bytes can also become a
+	 * performance bottleneck if all tasks of the same memcg are
+	 * trying to update it. So it's better to ignore it and try
+	 * grab some new pages. The stock's nr_bytes will be flushed to
+	 * objcg->nr_charged_bytes later on when objcg changes.
+	 *
+	 * The stock's nr_bytes may contain enough pre-charged bytes
+	 * to allow one less page from being charged, but we can't rely
+	 * on the pre-charged bytes not being changed outside of
+	 * consume_obj_stock() or refill_obj_stock(). So ignore those
+	 * pre-charged bytes as well when charging pages. To avoid a
+	 * page uncharge right after a page charge, we set the
+	 * allow_uncharge flag to false when calling refill_obj_stock()
+	 * to temporarily allow the pre-charged bytes to exceed the page
+	 * size limit. The maximum reachable value of the pre-charged
+	 * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
+	 * race.
+	 */
+	nr_pages = size >> PAGE_SHIFT;
+	nr_bytes = size & (PAGE_SIZE - 1);
+
+	if (nr_bytes)
+		nr_pages += 1;
+
+	ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
+	if (!ret && nr_bytes)
+		refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
+
+	return ret;
+}
+
+/* Return @size bytes to @objcg's stock; whole pages may be uncharged as a result. */
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
+{
+	refill_obj_stock(objcg, size, true);
+}
+
+#endif /* CONFIG_MEMCG_KMEM */
+
+/*
+ * Because page_memcg(head) is not set on tails, set it now.
+ *
+ * Each of the @nr - 1 tail pages gets a copy of the head's memcg_data and
+ * one extra reference is taken per tail: on the objcg for kmem folios, on
+ * the memcg's css otherwise.
+ */
+void split_page_memcg(struct page *head, unsigned int nr)
+{
+	struct folio *folio = page_folio(head);
+	struct mem_cgroup *memcg = folio_memcg(folio);
+	int i;
+
+	if (mem_cgroup_disabled() || !memcg)
+		return;
+
+	for (i = 1; i < nr; i++)
+		folio_page(folio, i)->memcg_data = folio->memcg_data;
+
+	if (folio_memcg_kmem(folio))
+		obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
+	else
+		css_get_many(&memcg->css, nr - 1);
+}
+
+#ifdef CONFIG_SWAP
+/**
+ * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
+ * @entry: swap entry to be moved
+ * @from:  mem_cgroup which the entry is moved from
+ * @to:  mem_cgroup which the entry is moved to
+ *
+ * It succeeds only when the swap_cgroup's record for this entry is the same
+ * as the mem_cgroup's id of @from.
+ *
+ * Returns 0 on success, -EINVAL on failure.
+ *
+ * The caller must have charged to @to, IOW, called page_counter_charge() about
+ * both res and memsw, and called css_get().
+ */ +static int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to) +{ + unsigned short old_id, new_id; + + old_id = mem_cgroup_id(from); + new_id = mem_cgroup_id(to); + + if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { + mod_memcg_state(from, MEMCG_SWAP, -1); + mod_memcg_state(to, MEMCG_SWAP, 1); + return 0; + } + return -EINVAL; +} +#else +static inline int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to) +{ + return -EINVAL; +} +#endif + +static DEFINE_MUTEX(memcg_max_mutex); + +static int mem_cgroup_resize_max(struct mem_cgroup *memcg, + unsigned long max, bool memsw) +{ + bool enlarge = false; + bool drained = false; + int ret; + bool limits_invariant; + struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; + + do { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + mutex_lock(&memcg_max_mutex); + /* + * Make sure that the new limit (memsw or memory limit) doesn't + * break our basic invariant rule memory.max <= memsw.max. + */ + limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) : + max <= memcg->memsw.max; + if (!limits_invariant) { + mutex_unlock(&memcg_max_mutex); + ret = -EINVAL; + break; + } + if (max > counter->max) + enlarge = true; + ret = page_counter_set_max(counter, max); + mutex_unlock(&memcg_max_mutex); + + if (!ret) + break; + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, + memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP)) { + ret = -EBUSY; + break; + } + } while (true); + + if (!ret && enlarge) + memcg_oom_recover(memcg); + + return ret; +} + +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, + gfp_t gfp_mask, + unsigned long *total_scanned) +{ + unsigned long nr_reclaimed = 0; + struct mem_cgroup_per_node *mz, *next_mz = NULL; + unsigned long reclaimed; + int loop = 0; + struct mem_cgroup_tree_per_node *mctz; + unsigned long excess; + + if (order > 0) + return 0; + + mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; + + /* + * Do not even bother to check the largest node if the root + * is empty. Do it lockless to prevent lock bouncing. Races + * are acceptable as soft limit is best effort anyway. + */ + if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) + return 0; + + /* + * This loop can run a while, especially if mem_cgroups continuously + * keep exceeding their soft limit and putting the system under + * pressure. + */ + do { + if (next_mz) + mz = next_mz; + else + mz = mem_cgroup_largest_soft_limit_node(mctz); + if (!mz) + break; + + reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, + gfp_mask, total_scanned); + nr_reclaimed += reclaimed; + spin_lock_irq(&mctz->lock); + + /* + * If we failed to reclaim anything from this memory cgroup + * it is time to move on to the next cgroup. + */ + next_mz = NULL; + if (!reclaimed) + next_mz = __mem_cgroup_largest_soft_limit_node(mctz); + + excess = soft_limit_excess(mz->memcg); + /* + * One school of thought says that we should not add + * back the node to the tree if reclaim returns 0. + * But our reclaim could return 0, simply because due + * to priority we are exposing a smaller subset of + * memory to reclaim from. Consider this as a longer + * term TODO. 
+ */ + /* If excess == 0, no tree ops */ + __mem_cgroup_insert_exceeded(mz, mctz, excess); + spin_unlock_irq(&mctz->lock); + css_put(&mz->memcg->css); + loop++; + /* + * Could not reclaim anything and there are no more + * mem cgroups to try or we seem to be looping without + * reclaiming anything. + */ + if (!nr_reclaimed && + (next_mz == NULL || + loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) + break; + } while (!nr_reclaimed); + if (next_mz) + css_put(&next_mz->memcg->css); + return nr_reclaimed; +} + +/* + * Reclaims as many pages from the given memcg as possible. + * + * Returns 0 once the memory counter reads zero or the MAX_RECLAIM_RETRIES + * budget of failed reclaim attempts is used up, -EINTR if a signal is + * pending. + * + * Caller is responsible for holding css reference for memcg. + */ +static int mem_cgroup_force_empty(struct mem_cgroup *memcg) +{ + int nr_retries = MAX_RECLAIM_RETRIES; + + /* we call try-to-free pages to make this cgroup empty */ + lru_add_drain_all(); + + drain_all_stock(memcg); + + /* try to free all pages in this cgroup */ + while (nr_retries && page_counter_read(&memcg->memory)) { + if (signal_pending(current)) + return -EINTR; + + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, + MEMCG_RECLAIM_MAY_SWAP)) + nr_retries--; + } + + return 0; +} + +static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + if (mem_cgroup_is_root(memcg)) + return -EINVAL; + return mem_cgroup_force_empty(memcg) ?: nbytes; +} + +static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return 1; +} + +static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + if (val == 1) + return 0; + + pr_warn_once("Non-hierarchical mode is deprecated. 
" + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); + + return -EINVAL; +} + +static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +{ + unsigned long val; + + if (mem_cgroup_is_root(memcg)) { + mem_cgroup_flush_stats(); + val = memcg_page_state(memcg, NR_FILE_PAGES) + + memcg_page_state(memcg, NR_ANON_MAPPED); + if (swap) + val += memcg_page_state(memcg, MEMCG_SWAP); + } else { + if (!swap) + val = page_counter_read(&memcg->memory); + else + val = page_counter_read(&memcg->memsw); + } + return val; +} + +enum { + RES_USAGE, + RES_LIMIT, + RES_MAX_USAGE, + RES_FAILCNT, + RES_SOFT_LIMIT, +}; + +static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct page_counter *counter; + + switch (MEMFILE_TYPE(cft->private)) { + case _MEM: + counter = &memcg->memory; + break; + case _MEMSWAP: + counter = &memcg->memsw; + break; + case _KMEM: + counter = &memcg->kmem; + break; + case _TCP: + counter = &memcg->tcpmem; + break; + default: + BUG(); + } + + switch (MEMFILE_ATTR(cft->private)) { + case RES_USAGE: + if (counter == &memcg->memory) + return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; + if (counter == &memcg->memsw) + return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; + return (u64)page_counter_read(counter) * PAGE_SIZE; + case RES_LIMIT: + return (u64)counter->max * PAGE_SIZE; + case RES_MAX_USAGE: + return (u64)counter->watermark * PAGE_SIZE; + case RES_FAILCNT: + return counter->failcnt; + case RES_SOFT_LIMIT: + return (u64)memcg->soft_limit * PAGE_SIZE; + default: + BUG(); + } +} + +#ifdef CONFIG_MEMCG_KMEM +static int memcg_online_kmem(struct mem_cgroup *memcg) +{ + struct obj_cgroup *objcg; + + if (mem_cgroup_kmem_disabled()) + return 0; + + if (unlikely(mem_cgroup_is_root(memcg))) + return 0; + + objcg = obj_cgroup_alloc(); + if (!objcg) + return -ENOMEM; + + objcg->memcg = memcg; + 
rcu_assign_pointer(memcg->objcg, objcg); + + static_branch_enable(&memcg_kmem_enabled_key); + + memcg->kmemcg_id = memcg->id.id; + + return 0; +} + +static void memcg_offline_kmem(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent; + + if (mem_cgroup_kmem_disabled()) + return; + + if (unlikely(mem_cgroup_is_root(memcg))) + return; + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + memcg_reparent_objcgs(memcg, parent); + + /* + * After we have finished memcg_reparent_objcgs(), all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. + * The ordering is imposed by list_lru_node->lock taken by + * memcg_reparent_list_lrus(). + */ + memcg_reparent_list_lrus(memcg, parent); +} +#else +static int memcg_online_kmem(struct mem_cgroup *memcg) +{ + return 0; +} +static void memcg_offline_kmem(struct mem_cgroup *memcg) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) +{ + int ret; + + mutex_lock(&memcg_max_mutex); + + ret = page_counter_set_max(&memcg->tcpmem, max); + if (ret) + goto out; + + if (!memcg->tcpmem_active) { + /* + * The active flag needs to be written after the static_key + * update. This is what guarantees that the socket activation + * function is the last one to run. See mem_cgroup_sk_alloc() + * for details, and note that we don't mark any socket as + * belonging to this memcg until that flag is up. + * + * We need to do this, because static_keys will span multiple + * sites, but we can't control their order. If we mark a socket + * as accounted, but the accounting functions are not patched in + * yet, we'll lose accounting. + * + * We never race with the readers in mem_cgroup_sk_alloc(), + * because when this value changes, the code to process it is not + * patched in yet. 
+ */ + static_branch_inc(&memcg_sockets_enabled_key); + memcg->tcpmem_active = true; + } +out: + mutex_unlock(&memcg_max_mutex); + return ret; +} + +/* + * Write handler for the RES_LIMIT and RES_SOFT_LIMIT attributes. + * + * The written string is parsed into a page count by + * page_counter_memparse(). For RES_LIMIT the value is applied according + * to MEMFILE_TYPE() and is refused with -EINVAL on the root cgroup. + * + * Returns nbytes on success or a negative errno. + */ +static ssize_t mem_cgroup_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long nr_pages; + int ret; + + buf = strstrip(buf); + ret = page_counter_memparse(buf, "-1", &nr_pages); + if (ret) + return ret; + + switch (MEMFILE_ATTR(of_cft(of)->private)) { + case RES_LIMIT: + if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ + ret = -EINVAL; + break; + } + switch (MEMFILE_TYPE(of_cft(of)->private)) { + case _MEM: + ret = mem_cgroup_resize_max(memcg, nr_pages, false); + break; + case _MEMSWAP: + ret = mem_cgroup_resize_max(memcg, nr_pages, true); + break; + case _KMEM: + pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " + "Writing any value to this file has no effect. 
" + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); + ret = 0; + break; + case _TCP: + ret = memcg_update_tcp_max(memcg, nr_pages); + break; + } + break; + case RES_SOFT_LIMIT: + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + ret = -EOPNOTSUPP; + } else { + memcg->soft_limit = nr_pages; + ret = 0; + } + break; + } + return ret ?: nbytes; +} + +static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct page_counter *counter; + + switch (MEMFILE_TYPE(of_cft(of)->private)) { + case _MEM: + counter = &memcg->memory; + break; + case _MEMSWAP: + counter = &memcg->memsw; + break; + case _KMEM: + counter = &memcg->kmem; + break; + case _TCP: + counter = &memcg->tcpmem; + break; + default: + BUG(); + } + + switch (MEMFILE_ATTR(of_cft(of)->private)) { + case RES_MAX_USAGE: + page_counter_reset_watermark(counter); + break; + case RES_FAILCNT: + counter->failcnt = 0; + break; + default: + BUG(); + } + + return nbytes; +} + +static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return mem_cgroup_from_css(css)->move_charge_at_immigrate; +} + +#ifdef CONFIG_MMU +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); + + if (val & ~MOVE_MASK) + return -EINVAL; + + /* + * No kind of locking is needed in here, because ->can_attach() will + * check this value once in the beginning of the process, and then carry + * on with stale data. This means that changes to this value will only + * affect task migrations starting after the change. 
+ */ + memcg->move_charge_at_immigrate = val; + return 0; +} +#else +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + return -ENOSYS; +} +#endif + +#ifdef CONFIG_NUMA + +#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) +#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) +#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) + +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask, bool tree) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + unsigned long nr = 0; + enum lru_list lru; + + VM_BUG_ON((unsigned)nid >= nr_node_ids); + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + if (tree) + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); + else + nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); + } + return nr; +} + +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, + unsigned int lru_mask, + bool tree) +{ + unsigned long nr = 0; + enum lru_list lru; + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + if (tree) + nr += memcg_page_state(memcg, NR_LRU_BASE + lru); + else + nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); + } + return nr; +} + +static int memcg_numa_stat_show(struct seq_file *m, void *v) +{ + struct numa_stat { + const char *name; + unsigned int lru_mask; + }; + + static const struct numa_stat stats[] = { + { "total", LRU_ALL }, + { "file", LRU_ALL_FILE }, + { "anon", LRU_ALL_ANON }, + { "unevictable", BIT(LRU_UNEVICTABLE) }, + }; + const struct numa_stat *stat; + int nid; + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + mem_cgroup_flush_stats(); + + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + seq_printf(m, "%s=%lu", stat->name, + mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, + false)); + for_each_node_state(nid, N_MEMORY) + seq_printf(m, " N%d=%lu", nid, + 
mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask, false)); + seq_putc(m, '\n'); + } + + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + + seq_printf(m, "hierarchical_%s=%lu", stat->name, + mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, + true)); + for_each_node_state(nid, N_MEMORY) + seq_printf(m, " N%d=%lu", nid, + mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask, true)); + seq_putc(m, '\n'); + } + + return 0; +} +#endif /* CONFIG_NUMA */ + +static const unsigned int memcg1_stats[] = { + NR_FILE_PAGES, + NR_ANON_MAPPED, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + NR_ANON_THPS, +#endif + NR_SHMEM, + NR_FILE_MAPPED, + NR_FILE_DIRTY, + NR_WRITEBACK, + WORKINGSET_REFAULT_ANON, + WORKINGSET_REFAULT_FILE, + MEMCG_SWAP, +}; + +static const char *const memcg1_stat_names[] = { + "cache", + "rss", +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "rss_huge", +#endif + "shmem", + "mapped_file", + "dirty", + "writeback", + "workingset_refault_anon", + "workingset_refault_file", + "swap", +}; + +/* Universal VM events cgroup1 shows, original sort order */ +static const unsigned int memcg1_events[] = { + PGPGIN, + PGPGOUT, + PGFAULT, + PGMAJFAULT, +}; + +static int memcg_stat_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + unsigned long memory, memsw; + struct mem_cgroup *mi; + unsigned int i; + + BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); + + mem_cgroup_flush_stats(); + + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + unsigned long nr; + + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) + continue; + nr = memcg_page_state_local(memcg, memcg1_stats[i]); + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], + nr * memcg_page_state_unit(memcg1_stats[i])); + } + + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) + seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]), + memcg_events_local(memcg, memcg1_events[i])); + + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "%s 
%lu\n", lru_list_name(i), + memcg_page_state_local(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); + + /* Hierarchical information */ + memory = memsw = PAGE_COUNTER_MAX; + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { + memory = min(memory, READ_ONCE(mi->memory.max)); + memsw = min(memsw, READ_ONCE(mi->memsw.max)); + } + seq_printf(m, "hierarchical_memory_limit %llu\n", + (u64)memory * PAGE_SIZE); + if (do_memsw_account()) + seq_printf(m, "hierarchical_memsw_limit %llu\n", + (u64)memsw * PAGE_SIZE); + + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + unsigned long nr; + + if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) + continue; + nr = memcg_page_state(memcg, memcg1_stats[i]); + seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], + (u64)nr * memcg_page_state_unit(memcg1_stats[i])); + } + + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) + seq_printf(m, "total_%s %llu\n", + vm_event_name(memcg1_events[i]), + (u64)memcg_events(memcg, memcg1_events[i])); + + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "total_%s %llu\n", lru_list_name(i), + (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); + +#ifdef CONFIG_DEBUG_VM + { + pg_data_t *pgdat; + struct mem_cgroup_per_node *mz; + unsigned long anon_cost = 0; + unsigned long file_cost = 0; + + for_each_online_pgdat(pgdat) { + mz = memcg->nodeinfo[pgdat->node_id]; + + anon_cost += mz->lruvec.anon_cost; + file_cost += mz->lruvec.file_cost; + } + seq_printf(m, "anon_cost %lu\n", anon_cost); + seq_printf(m, "file_cost %lu\n", file_cost); + } +#endif + + return 0; +} + +static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return mem_cgroup_swappiness(memcg); +} + +static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (val > 200) + return -EINVAL; + + if (!mem_cgroup_is_root(memcg)) + 
memcg->swappiness = val; + else + vm_swappiness = val; + + return 0; +} + +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) +{ + struct mem_cgroup_threshold_ary *t; + unsigned long usage; + int i; + + rcu_read_lock(); + if (!swap) + t = rcu_dereference(memcg->thresholds.primary); + else + t = rcu_dereference(memcg->memsw_thresholds.primary); + + if (!t) + goto unlock; + + usage = mem_cgroup_usage(memcg, swap); + + /* + * current_threshold points to threshold just below or equal to usage. + * If it's not true, a threshold was crossed after last + * call of __mem_cgroup_threshold(). + */ + i = t->current_threshold; + + /* + * Iterate backward over array of thresholds starting from + * current_threshold and check if a threshold is crossed. + * If none of thresholds below usage is crossed, we read + * only one element of the array here. + */ + for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) + eventfd_signal(t->entries[i].eventfd, 1); + + /* i = current_threshold + 1 */ + i++; + + /* + * Iterate forward over array of thresholds starting from + * current_threshold+1 and check if a threshold is crossed. + * If none of thresholds above usage is crossed, we read + * only one element of the array here. 
+ */ + for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) + eventfd_signal(t->entries[i].eventfd, 1); + + /* Update current_threshold */ + t->current_threshold = i - 1; +unlock: + rcu_read_unlock(); +} + +static void mem_cgroup_threshold(struct mem_cgroup *memcg) +{ + while (memcg) { + __mem_cgroup_threshold(memcg, false); + if (do_memsw_account()) + __mem_cgroup_threshold(memcg, true); + + memcg = parent_mem_cgroup(memcg); + } +} + +static int compare_thresholds(const void *a, const void *b) +{ + const struct mem_cgroup_threshold *_a = a; + const struct mem_cgroup_threshold *_b = b; + + if (_a->threshold > _b->threshold) + return 1; + + if (_a->threshold < _b->threshold) + return -1; + + return 0; +} + +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) +{ + struct mem_cgroup_eventfd_list *ev; + + spin_lock(&memcg_oom_lock); + + list_for_each_entry(ev, &memcg->oom_notify, list) + eventfd_signal(ev->eventfd, 1); + + spin_unlock(&memcg_oom_lock); + return 0; +} + +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) +{ + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, memcg) + mem_cgroup_oom_notify_cb(iter); +} + +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args, enum res_type type) +{ + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; + unsigned long threshold; + unsigned long usage; + int i, size, ret; + + ret = page_counter_memparse(args, "-1", &threshold); + if (ret) + return ret; + + mutex_lock(&memcg->thresholds_lock); + + if (type == _MEM) { + thresholds = &memcg->thresholds; + usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { + thresholds = &memcg->memsw_thresholds; + usage = mem_cgroup_usage(memcg, true); + } else + BUG(); + + /* Check if a threshold crossed before adding a new one */ + if (thresholds->primary) + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + size = 
thresholds->primary ? thresholds->primary->size + 1 : 1; + + /* Allocate memory for new array of thresholds */ + new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + goto unlock; + } + new->size = size; + + /* Copy thresholds (if any) to new array */ + if (thresholds->primary) + memcpy(new->entries, thresholds->primary->entries, + flex_array_size(new, entries, size - 1)); + + /* Add new threshold */ + new->entries[size - 1].eventfd = eventfd; + new->entries[size - 1].threshold = threshold; + + /* Sort thresholds. Registering of new threshold isn't time-critical */ + sort(new->entries, size, sizeof(*new->entries), + compare_thresholds, NULL); + + /* Find current threshold */ + new->current_threshold = -1; + for (i = 0; i < size; i++) { + if (new->entries[i].threshold <= usage) { + /* + * new->current_threshold will not be used until + * rcu_assign_pointer(), so it's safe to increment + * it here. + */ + ++new->current_threshold; + } else + break; + } + + /* Free old spare buffer and save old primary buffer as spare */ + kfree(thresholds->spare); + thresholds->spare = thresholds->primary; + + rcu_assign_pointer(thresholds->primary, new); + + /* To be sure that nobody uses thresholds */ + synchronize_rcu(); + +unlock: + mutex_unlock(&memcg->thresholds_lock); + + return ret; +} + +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); +} + +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); +} + +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, enum res_type type) +{ + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; + unsigned long usage; + int i, j, size, 
entries; + + mutex_lock(&memcg->thresholds_lock); + + if (type == _MEM) { + thresholds = &memcg->thresholds; + usage = mem_cgroup_usage(memcg, false); + } else if (type == _MEMSWAP) { + thresholds = &memcg->memsw_thresholds; + usage = mem_cgroup_usage(memcg, true); + } else + BUG(); + + if (!thresholds->primary) + goto unlock; + + /* Check if a threshold was crossed before removing */ + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + /* Count the thresholds that remain (size) and those being removed (entries) */ + size = entries = 0; + for (i = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd != eventfd) + size++; + else + entries++; + } + + new = thresholds->spare; + + /* If no entries are registered for this eventfd, there is nothing to do */ + if (!entries) + goto unlock; + + /* Set thresholds array to NULL if we don't have thresholds */ + if (!size) { + kfree(new); + new = NULL; + goto swap_buffers; + } + + new->size = size; + + /* Copy thresholds and find current threshold */ + new->current_threshold = -1; + for (i = 0, j = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd == eventfd) + continue; + + new->entries[j] = thresholds->primary->entries[i]; + if (new->entries[j].threshold <= usage) { + /* + * new->current_threshold will not be used + * until rcu_assign_pointer(), so it's safe to increment + * it here. 
+ */ + ++new->current_threshold; + } + j++; + } + +swap_buffers: + /* Swap primary and spare array */ + thresholds->spare = thresholds->primary; + + rcu_assign_pointer(thresholds->primary, new); + + /* To be sure that nobody uses thresholds */ + synchronize_rcu(); + + /* If all events are unregistered, free the spare array */ + if (!new) { + kfree(thresholds->spare); + thresholds->spare = NULL; + } +unlock: + mutex_unlock(&memcg->thresholds_lock); +} + +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); +} + +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); +} + +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + struct mem_cgroup_eventfd_list *event; + + event = kmalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + spin_lock(&memcg_oom_lock); + + event->eventfd = eventfd; + list_add(&event->list, &memcg->oom_notify); + + /* already in OOM ? 
*/ + if (memcg->under_oom) + eventfd_signal(eventfd, 1); + spin_unlock(&memcg_oom_lock); + + return 0; +} + +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + struct mem_cgroup_eventfd_list *ev, *tmp; + + spin_lock(&memcg_oom_lock); + + list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { + if (ev->eventfd == eventfd) { + list_del(&ev->list); + kfree(ev); + } + } + + spin_unlock(&memcg_oom_lock); +} + +static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); + + seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); + seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); + seq_printf(sf, "oom_kill %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); + return 0; +} + +static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + /* cannot be set on the root cgroup, and only 0 and 1 are allowed */ + if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1))) + return -EINVAL; + + memcg->oom_kill_disable = val; + if (!val) + memcg_oom_recover(memcg); + + return 0; +} + +#ifdef CONFIG_CGROUP_WRITEBACK + +#include <trace/events/writeback.h> + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return wb_domain_init(&memcg->cgwb_domain, gfp); +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ + wb_domain_exit(&memcg->cgwb_domain); +} + +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ + wb_domain_size_changed(&memcg->cgwb_domain); +} + +struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + + if (!memcg->css.parent) + return NULL; + + return &memcg->cgwb_domain; +} + +/** + * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg + * @wb: bdi_writeback in question + * @pfilepages: out 
parameter for number of file pages + * @pheadroom: out parameter for number of allocatable pages according to memcg + * @pdirty: out parameter for number of dirty pages + * @pwriteback: out parameter for number of pages under writeback + * + * Determine the numbers of file, headroom, dirty, and writeback pages in + * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom + * is a bit more involved. + * + * A memcg's headroom is "min(max, high) - used". In the hierarchy, the + * headroom is calculated as the lowest headroom of itself and the + * ancestors. Note that this doesn't consider the actual amount of + * available memory in the system. The caller should further cap + * *@pheadroom accordingly. + */ +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, + unsigned long *pheadroom, unsigned long *pdirty, + unsigned long *pwriteback) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + struct mem_cgroup *parent; + + mem_cgroup_flush_stats(); + + *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); + *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); + *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) + + memcg_page_state(memcg, NR_ACTIVE_FILE); + + *pheadroom = PAGE_COUNTER_MAX; + while ((parent = parent_mem_cgroup(memcg))) { + unsigned long ceiling = min(READ_ONCE(memcg->memory.max), + READ_ONCE(memcg->memory.high)); + unsigned long used = page_counter_read(&memcg->memory); + + *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); + memcg = parent; + } +} + +/* + * Foreign dirty flushing + * + * There's an inherent mismatch between memcg and writeback. The former + * tracks ownership per-page while the latter per-inode. This was a + * deliberate design decision because honoring per-page ownership in the + * writeback path is complicated, may lead to higher CPU and IO overheads + * and deemed unnecessary given that write-sharing an inode across + * different cgroups isn't a common use-case. 
+ * + * Combined with inode majority-writer ownership switching, this works well + * enough in most cases but there are some pathological cases. For + * example, let's say there are two cgroups A and B which keep writing to + * different but confined parts of the same inode. B owns the inode and + * A's memory is limited far below B's. A's dirty ratio can rise enough to + * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid + * triggering background writeback. A will be slowed down without a way to + * make writeback of the dirty pages happen. + * + * Conditions like the above can lead to a cgroup getting repeatedly and + * severely throttled after making some progress after each + * dirty_expire_interval while the underlying IO device is almost + * completely idle. + * + * Solving this problem completely requires matching the ownership tracking + * granularities between memcg and writeback in either direction. However, + * the more egregious behaviors can be avoided by simply remembering the + * most recent foreign dirtying events and initiating remote flushes on + * them when local writeback isn't enough to keep the memory clean enough. + * + * The following two functions implement such a mechanism. When a foreign + * page - a page whose memcg and writeback ownerships don't match - is + * dirtied, mem_cgroup_track_foreign_dirty() records the inode-owning + * bdi_writeback on the page-owning memcg. When balance_dirty_pages() + * decides that the memcg needs to sleep due to high dirty ratio, it calls + * mem_cgroup_flush_foreign() which queues writeback on the recorded + * foreign bdi_writebacks which haven't expired. Both the numbers of + * recorded bdi_writebacks and concurrent in-flight foreign writebacks are + * limited to MEMCG_CGWB_FRN_CNT. + * + * The mechanism only remembers IDs and doesn't hold any object references. + * As being wrong occasionally doesn't matter, updates and accesses to the + * records are lockless and racy. 
+ */ +void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, + struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = folio_memcg(folio); + struct memcg_cgwb_frn *frn; + u64 now = get_jiffies_64(); + u64 oldest_at = now; + int oldest = -1; + int i; + + trace_track_foreign_dirty(folio, wb); + + /* + * Pick the slot to use. If there is already a slot for @wb, keep + * using it. If not replace the oldest one which isn't being + * written out. + */ + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { + frn = &memcg->cgwb_frn[i]; + if (frn->bdi_id == wb->bdi->id && + frn->memcg_id == wb->memcg_css->id) + break; + if (time_before64(frn->at, oldest_at) && + atomic_read(&frn->done.cnt) == 1) { + oldest = i; + oldest_at = frn->at; + } + } + + if (i < MEMCG_CGWB_FRN_CNT) { + /* + * Re-using an existing one. Update timestamp lazily to + * avoid making the cacheline hot. We want them to be + * reasonably up-to-date and significantly shorter than + * dirty_expire_interval as that's what expires the record. + * Use the shorter of 1s and dirty_expire_interval / 8. + */ + unsigned long update_intv = + min_t(unsigned long, HZ, + msecs_to_jiffies(dirty_expire_interval * 10) / 8); + + if (time_before64(frn->at, now - update_intv)) + frn->at = now; + } else if (oldest >= 0) { + /* replace the oldest free one */ + frn = &memcg->cgwb_frn[oldest]; + frn->bdi_id = wb->bdi->id; + frn->memcg_id = wb->memcg_css->id; + frn->at = now; + } +} + +/* issue foreign writeback flushes for recorded foreign dirtying events */ +void mem_cgroup_flush_foreign(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); + u64 now = jiffies_64; + int i; + + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { + struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; + + /* + * If the record is older than dirty_expire_interval, + * writeback on it has already started. No need to kick it + * off again. 
Also, don't start a new one if there's + * already one in flight. + */ + if (time_after64(frn->at, now - intv) && + atomic_read(&frn->done.cnt) == 1) { + frn->at = 0; + trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, + WB_REASON_FOREIGN_FLUSH, + &frn->done); + } + } +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return 0; +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ +} + +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +/* + * DO NOT USE IN NEW FILES. + * + * "cgroup.event_control" implementation. + * + * This is way over-engineered. It tries to support fully configurable + * events for each user. Such level of flexibility is completely + * unnecessary especially in the light of the planned unified hierarchy. + * + * Please deprecate this and replace with something simpler if at all + * possible. + */ + +/* + * Unregister event and free resources. + * + * Gets called from workqueue. + */ +static void memcg_event_remove(struct work_struct *work) +{ + struct mem_cgroup_event *event = + container_of(work, struct mem_cgroup_event, remove); + struct mem_cgroup *memcg = event->memcg; + + remove_wait_queue(event->wqh, &event->wait); + + event->unregister_event(memcg, event->eventfd); + + /* Notify userspace the event is going away. */ + eventfd_signal(event->eventfd, 1); + + eventfd_ctx_put(event->eventfd); + kfree(event); + css_put(&memcg->css); +} + +/* + * Gets called on EPOLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. 
+ */ +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, + int sync, void *key) +{ + struct mem_cgroup_event *event = + container_of(wait, struct mem_cgroup_event, wait); + struct mem_cgroup *memcg = event->memcg; + __poll_t flags = key_to_poll(key); + + if (flags & EPOLLHUP) { + /* + * If the event has been detached at cgroup removal, we + * can simply return knowing the other side will cleanup + * for us. + * + * We can't race against event freeing since the other + * side will require wqh->lock via remove_wait_queue(), + * which we hold. + */ + spin_lock(&memcg->event_list_lock); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + /* + * We are in atomic context, but cgroup_event_remove() + * may sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + spin_unlock(&memcg->event_list_lock); + } + + return 0; +} + +static void memcg_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct mem_cgroup_event *event = + container_of(pt, struct mem_cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); +} + +/* + * DO NOT USE IN NEW FILES. + * + * Parse input and register new cgroup event handler. + * + * Input must be in format ' '. + * Interpretation of args is defined by control file implementation. 
+ */ +static ssize_t memcg_write_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup_subsys_state *css = of_css(of); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_event *event; + struct cgroup_subsys_state *cfile_css; + unsigned int efd, cfd; + struct fd efile; + struct fd cfile; + struct dentry *cdentry; + const char *name; + char *endp; + int ret; + + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return -EOPNOTSUPP; + + buf = strstrip(buf); + + efd = simple_strtoul(buf, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buf = endp + 1; + + cfd = simple_strtoul(buf, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buf = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + event->memcg = memcg; + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, memcg_event_wake); + INIT_WORK(&event->remove, memcg_event_remove); + + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out_kfree; + } + + event->eventfd = eventfd_ctx_fileget(efile.file); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto out_put_efile; + } + + cfile = fdget(cfd); + if (!cfile.file) { + ret = -EBADF; + goto out_put_eventfd; + } + + /* the process need read permission on control file */ + /* AV: shouldn't we check that it's been opened for read instead? */ + ret = file_permission(cfile.file, MAY_READ); + if (ret < 0) + goto out_put_cfile; + + /* + * The control file must be a regular cgroup1 file. As a regular cgroup + * file can't be renamed, it's safe to access its name afterwards. + */ + cdentry = cfile.file->f_path.dentry; + if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { + ret = -EINVAL; + goto out_put_cfile; + } + + /* + * Determine the event callbacks and set them in @event. 
This used + * to be done via struct cftype but cgroup core no longer knows + * about these events. The following is crude but the whole thing + * is for compatibility anyway. + * + * DO NOT ADD NEW FILES. + */ + name = cdentry->d_name.name; + + if (!strcmp(name, "memory.usage_in_bytes")) { + event->register_event = mem_cgroup_usage_register_event; + event->unregister_event = mem_cgroup_usage_unregister_event; + } else if (!strcmp(name, "memory.oom_control")) { + event->register_event = mem_cgroup_oom_register_event; + event->unregister_event = mem_cgroup_oom_unregister_event; + } else if (!strcmp(name, "memory.pressure_level")) { + event->register_event = vmpressure_register_event; + event->unregister_event = vmpressure_unregister_event; + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { + event->register_event = memsw_cgroup_usage_register_event; + event->unregister_event = memsw_cgroup_usage_unregister_event; + } else { + ret = -EINVAL; + goto out_put_cfile; + } + + /* + * Verify @cfile should belong to @css. Also, remaining events are + * automatically removed on cgroup destruction but the removal is + * asynchronous, so take an extra ref on @css. 
+ */ + cfile_css = css_tryget_online_from_dir(cdentry->d_parent, + &memory_cgrp_subsys); + ret = -EINVAL; + if (IS_ERR(cfile_css)) + goto out_put_cfile; + if (cfile_css != css) { + css_put(cfile_css); + goto out_put_cfile; + } + + ret = event->register_event(memcg, event->eventfd, buf); + if (ret) + goto out_put_css; + + vfs_poll(efile.file, &event->pt); + + spin_lock_irq(&memcg->event_list_lock); + list_add(&event->list, &memcg->event_list); + spin_unlock_irq(&memcg->event_list_lock); + + fdput(cfile); + fdput(efile); + + return nbytes; + +out_put_css: + css_put(css); +out_put_cfile: + fdput(cfile); +out_put_eventfd: + eventfd_ctx_put(event->eventfd); +out_put_efile: + fdput(efile); +out_kfree: + kfree(event); + + return ret; +} + +#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) +static int mem_cgroup_slab_show(struct seq_file *m, void *p) +{ + /* + * Deprecated. + * Please, take a look at tools/cgroup/memcg_slabinfo.py . + */ + return 0; +} +#endif + +static struct cftype mem_cgroup_legacy_files[] = { + { + .name = "usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "soft_limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "failcnt", + .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "stat", + .seq_show = memcg_stat_show, + }, + { + .name = "force_empty", + .write = mem_cgroup_force_empty_write, + }, + { + .name = "use_hierarchy", + .write_u64 = mem_cgroup_hierarchy_write, + 
.read_u64 = mem_cgroup_hierarchy_read, + }, + { + .name = "cgroup.event_control", /* XXX: for compat */ + .write = memcg_write_event_control, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, + }, + { + .name = "swappiness", + .read_u64 = mem_cgroup_swappiness_read, + .write_u64 = mem_cgroup_swappiness_write, + }, + { + .name = "move_charge_at_immigrate", + .read_u64 = mem_cgroup_move_charge_read, + .write_u64 = mem_cgroup_move_charge_write, + }, + { + .name = "oom_control", + .seq_show = mem_cgroup_oom_control_read, + .write_u64 = mem_cgroup_oom_control_write, + }, + { + .name = "pressure_level", + }, +#ifdef CONFIG_NUMA + { + .name = "numa_stat", + .seq_show = memcg_numa_stat_show, + }, +#endif + { + .name = "kmem.limit_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.failcnt", + .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, +#if defined(CONFIG_MEMCG_KMEM) && \ + (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) + { + .name = "kmem.slabinfo", + .seq_show = mem_cgroup_slab_show, + }, +#endif + { + .name = "kmem.tcp.limit_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.usage_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.failcnt", + .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, 
RES_MAX_USAGE), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { }, /* terminate */ +}; + +/* + * Private memory cgroup IDR + * + * Swap-out records and page cache shadow entries need to store memcg + * references in constrained space, so we maintain an ID space that is + * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of + * memory-controlled cgroups to 64k. + * + * However, there usually are many references to the offline CSS after + * the cgroup has been destroyed, such as page cache or reclaimable + * slab objects, that don't need to hang on to the ID. We want to keep + * those dead CSS from occupying IDs, or we might quickly exhaust the + * relatively small ID space and prevent the creation of new cgroups + * even when there are much fewer than 64k cgroups - possibly none. + * + * Maintain a private 16-bit ID space for memcg, and allow the ID to + * be freed and recycled when it's no longer needed, which is usually + * when the CSS is offlined. + * + * The only exception to that are records of swapped out tmpfs/shmem + * pages that need to be attributed to live ancestors on swapin. But + * those references are manageable from userspace. 
+ */ + +static DEFINE_IDR(mem_cgroup_idr); + +static void mem_cgroup_id_remove(struct mem_cgroup *memcg) +{ + if (memcg->id.id > 0) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + } +} + +static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg, + unsigned int n) +{ + refcount_add(n, &memcg->id.ref); +} + +static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) +{ + if (refcount_sub_and_test(n, &memcg->id.ref)) { + mem_cgroup_id_remove(memcg); + + /* Memcg ID pins CSS */ + css_put(&memcg->css); + } +} + +static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + mem_cgroup_id_put_many(memcg, 1); +} + +/** + * mem_cgroup_from_id - look up a memcg from a memcg id + * @id: the memcg id to look up + * + * Caller must hold rcu_read_lock(). + */ +struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return idr_find(&mem_cgroup_idr, id); +} + +#ifdef CONFIG_SHRINKER_DEBUG +struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +{ + struct cgroup *cgrp; + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; + + cgrp = cgroup_get_from_id(ino); + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + + css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); + if (css) + memcg = container_of(css, struct mem_cgroup, css); + else + memcg = ERR_PTR(-ENOENT); + + cgroup_put(cgrp); + + return memcg; +} +#endif + +static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) +{ + struct mem_cgroup_per_node *pn; + + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node); + if (!pn) + return 1; + + pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu, + GFP_KERNEL_ACCOUNT); + if (!pn->lruvec_stats_percpu) { + kfree(pn); + return 1; + } + + lruvec_init(&pn->lruvec); + pn->memcg = memcg; + + memcg->nodeinfo[node] = pn; + return 0; +} + +static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) +{ + struct mem_cgroup_per_node 
*pn = memcg->nodeinfo[node]; + + if (!pn) + return; + + free_percpu(pn->lruvec_stats_percpu); + kfree(pn); +} + +static void __mem_cgroup_free(struct mem_cgroup *memcg) +{ + int node; + + for_each_node(node) + free_mem_cgroup_per_node_info(memcg, node); + kfree(memcg->vmstats); + free_percpu(memcg->vmstats_percpu); + kfree(memcg); +} + +static void mem_cgroup_free(struct mem_cgroup *memcg) +{ + lru_gen_exit_memcg(memcg); + memcg_wb_domain_exit(memcg); + __mem_cgroup_free(memcg); +} + +static struct mem_cgroup *mem_cgroup_alloc(void) +{ + struct mem_cgroup *memcg; + int node; + int __maybe_unused i; + long error = -ENOMEM; + + memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); + if (!memcg) + return ERR_PTR(error); + + memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, + 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); + if (memcg->id.id < 0) { + error = memcg->id.id; + goto fail; + } + + memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); + if (!memcg->vmstats) + goto fail; + + memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, + GFP_KERNEL_ACCOUNT); + if (!memcg->vmstats_percpu) + goto fail; + + for_each_node(node) + if (alloc_mem_cgroup_per_node_info(memcg, node)) + goto fail; + + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto fail; + + INIT_WORK(&memcg->high_work, high_work_func); + INIT_LIST_HEAD(&memcg->oom_notify); + mutex_init(&memcg->thresholds_lock); + spin_lock_init(&memcg->move_lock); + vmpressure_init(&memcg->vmpressure); + INIT_LIST_HEAD(&memcg->event_list); + spin_lock_init(&memcg->event_list_lock); + memcg->socket_pressure = jiffies; +#ifdef CONFIG_MEMCG_KMEM + memcg->kmemcg_id = -1; + INIT_LIST_HEAD(&memcg->objcg_list); +#endif +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&memcg->cgwb_list); + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) + memcg->cgwb_frn[i].done = + __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + 
spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); + INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); + memcg->deferred_split_queue.split_queue_len = 0; +#endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); + lru_gen_init_memcg(memcg); + return memcg; +fail: + mem_cgroup_id_remove(memcg); + __mem_cgroup_free(memcg); + return ERR_PTR(error); +} + +static struct cgroup_subsys_state * __ref +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); + struct mem_cgroup *memcg, *old_memcg; + + old_memcg = set_active_memcg(parent); + memcg = mem_cgroup_alloc(); + set_active_memcg(old_memcg); + if (IS_ERR(memcg)) + return ERR_CAST(memcg); + + page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); + memcg->soft_limit = PAGE_COUNTER_MAX; +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + memcg->zswap_max = PAGE_COUNTER_MAX; +#endif + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); + if (parent) { + memcg->swappiness = mem_cgroup_swappiness(parent); + memcg->oom_kill_disable = parent->oom_kill_disable; + + page_counter_init(&memcg->memory, &parent->memory); + page_counter_init(&memcg->swap, &parent->swap); + page_counter_init(&memcg->kmem, &parent->kmem); + page_counter_init(&memcg->tcpmem, &parent->tcpmem); + } else { + init_memcg_events(); + page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->swap, NULL); + page_counter_init(&memcg->kmem, NULL); + page_counter_init(&memcg->tcpmem, NULL); + + root_mem_cgroup = memcg; + return &memcg->css; + } + + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) + static_branch_inc(&memcg_sockets_enabled_key); + + return &memcg->css; +} + +static int mem_cgroup_css_online(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (memcg_online_kmem(memcg)) + goto remove_id; + + /* + * A memcg must be visible for expand_shrinker_info() + * by the time 
the maps are allocated. So, we allocate maps + * here, when for_each_mem_cgroup() can't skip it. + */ + if (alloc_shrinker_info(memcg)) + goto offline_kmem; + + /* Online state pins memcg ID, memcg ID pins CSS */ + refcount_set(&memcg->id.ref, 1); + css_get(css); + + if (unlikely(mem_cgroup_is_root(memcg))) + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, + 2UL*HZ); + return 0; +offline_kmem: + memcg_offline_kmem(memcg); +remove_id: + mem_cgroup_id_remove(memcg); + return -ENOMEM; +} + +static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_event *event, *tmp; + + /* + * Unregister events and notify userspace. + * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace. + */ + spin_lock_irq(&memcg->event_list_lock); + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { + list_del_init(&event->list); + schedule_work(&event->remove); + } + spin_unlock_irq(&memcg->event_list_lock); + + page_counter_set_min(&memcg->memory, 0); + page_counter_set_low(&memcg->memory, 0); + + memcg_offline_kmem(memcg); + reparent_shrinker_deferred(memcg); + wb_memcg_offline(memcg); + + drain_all_stock(memcg); + + mem_cgroup_id_put(memcg); +} + +static void mem_cgroup_css_released(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + invalidate_reclaim_iterators(memcg); +} + +static void mem_cgroup_css_free(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + int __maybe_unused i; + +#ifdef CONFIG_CGROUP_WRITEBACK + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) + wb_wait_for_completion(&memcg->cgwb_frn[i].done); +#endif + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) + static_branch_dec(&memcg_sockets_enabled_key); + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) + 
static_branch_dec(&memcg_sockets_enabled_key); + + vmpressure_cleanup(&memcg->vmpressure); + cancel_work_sync(&memcg->high_work); + mem_cgroup_remove_from_trees(memcg); + free_shrinker_info(memcg); + mem_cgroup_free(memcg); +} + +/** + * mem_cgroup_css_reset - reset the states of a mem_cgroup + * @css: the target css + * + * Reset the states of the mem_cgroup associated with @css. This is + * invoked when the userland requests disabling on the default hierarchy + * but the memcg is pinned through dependency. The memcg should stop + * applying policies and should revert to the vanilla state as it may be + * made visible again. + * + * The current implementation only resets the essential configurations. + * This needs to be expanded to cover all the visible parts. + */ +static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); + page_counter_set_min(&memcg->memory, 0); + page_counter_set_low(&memcg->memory, 0); + page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); + memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); + memcg_wb_domain_size_changed(memcg); +} + +static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct memcg_vmstats_percpu *statc; + long delta, v; + int i, nid; + + statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); + + for (i = 0; i < MEMCG_NR_STAT; i++) { + /* + * Collect the aggregated propagation counts of groups + * below us. We're in a per-cpu loop here and this is + * a global counter, so the first cycle will get them. 
+ */ + delta = memcg->vmstats->state_pending[i]; + if (delta) + memcg->vmstats->state_pending[i] = 0; + + /* Add CPU changes on this level since the last flush */ + v = READ_ONCE(statc->state[i]); + if (v != statc->state_prev[i]) { + delta += v - statc->state_prev[i]; + statc->state_prev[i] = v; + } + + if (!delta) + continue; + + /* Aggregate counts on this level and propagate upwards */ + memcg->vmstats->state[i] += delta; + if (parent) + parent->vmstats->state_pending[i] += delta; + } + + for (i = 0; i < NR_MEMCG_EVENTS; i++) { + delta = memcg->vmstats->events_pending[i]; + if (delta) + memcg->vmstats->events_pending[i] = 0; + + v = READ_ONCE(statc->events[i]); + if (v != statc->events_prev[i]) { + delta += v - statc->events_prev[i]; + statc->events_prev[i] = v; + } + + if (!delta) + continue; + + memcg->vmstats->events[i] += delta; + if (parent) + parent->vmstats->events_pending[i] += delta; + } + + for_each_node_state(nid, N_MEMORY) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; + struct mem_cgroup_per_node *ppn = NULL; + struct lruvec_stats_percpu *lstatc; + + if (parent) + ppn = parent->nodeinfo[nid]; + + lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + delta = pn->lruvec_stats.state_pending[i]; + if (delta) + pn->lruvec_stats.state_pending[i] = 0; + + v = READ_ONCE(lstatc->state[i]); + if (v != lstatc->state_prev[i]) { + delta += v - lstatc->state_prev[i]; + lstatc->state_prev[i] = v; + } + + if (!delta) + continue; + + pn->lruvec_stats.state[i] += delta; + if (ppn) + ppn->lruvec_stats.state_pending[i] += delta; + } + } +} + +#ifdef CONFIG_MMU +/* Handlers for move charge at task migration. 
*/ +static int mem_cgroup_do_precharge(unsigned long count) +{ + int ret; + + /* Try a single bulk charge without reclaim first, kswapd may wake */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); + if (!ret) { + mc.precharge += count; + return ret; + } + + /* Try charges one by one with reclaim, but do not retry */ + while (count--) { + ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); + if (ret) + return ret; + mc.precharge++; + cond_resched(); + } + return 0; +} + +union mc_target { + struct page *page; + swp_entry_t ent; +}; + +enum mc_target_type { + MC_TARGET_NONE = 0, + MC_TARGET_PAGE, + MC_TARGET_SWAP, + MC_TARGET_DEVICE, +}; + +static struct page *mc_handle_present_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent) +{ + struct page *page = vm_normal_page(vma, addr, ptent); + + if (!page || !page_mapped(page)) + return NULL; + if (PageAnon(page)) { + if (!(mc.flags & MOVE_ANON)) + return NULL; + } else { + if (!(mc.flags & MOVE_FILE)) + return NULL; + } + if (!get_page_unless_zero(page)) + return NULL; + + return page; +} + +#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) +static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, + pte_t ptent, swp_entry_t *entry) +{ + struct page *page = NULL; + swp_entry_t ent = pte_to_swp_entry(ptent); + + if (!(mc.flags & MOVE_ANON)) + return NULL; + + /* + * Handle device private pages that are not accessible by the CPU, but + * stored as special swap entries in the page table. + */ + if (is_device_private_entry(ent)) { + page = pfn_swap_entry_to_page(ent); + if (!get_page_unless_zero(page)) + return NULL; + return page; + } + + if (non_swap_entry(ent)) + return NULL; + + /* + * Because swap_cache_get_folio() updates some statistics counter, + * we call find_get_page() with swapper_space directly. 
+ */ + page = find_get_page(swap_address_space(ent), swp_offset(ent)); + entry->val = ent.val; + + return page; +} +#else +static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, + pte_t ptent, swp_entry_t *entry) +{ + return NULL; +} +#endif + +static struct page *mc_handle_file_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent) +{ + if (!vma->vm_file) /* anonymous vma */ + return NULL; + if (!(mc.flags & MOVE_FILE)) + return NULL; + + /* page is moved even if it's not RSS of this task(page-faulted). */ + /* shmem/tmpfs may report page out on swap: account for that too. */ + return find_get_incore_page(vma->vm_file->f_mapping, + linear_page_index(vma, addr)); +} + +/** + * mem_cgroup_move_account - move account of the page + * @page: the page + * @compound: charge the page as compound or small page + * @from: mem_cgroup which the page is moved from. + * @to: mem_cgroup which the page is moved to. @from != @to. + * + * The caller must make sure the page is not on LRU (isolate_page() is useful.) + * + * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" + * from old cgroup. + */ +static int mem_cgroup_move_account(struct page *page, + bool compound, + struct mem_cgroup *from, + struct mem_cgroup *to) +{ + struct folio *folio = page_folio(page); + struct lruvec *from_vec, *to_vec; + struct pglist_data *pgdat; + unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; + int nid, ret; + + VM_BUG_ON(from == to); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON(compound && !folio_test_large(folio)); + + /* + * Prevent mem_cgroup_migrate() from looking at + * page's memory cgroup of its source page while we change it. 
+ */ + ret = -EBUSY; + if (!folio_trylock(folio)) + goto out; + + ret = -EINVAL; + if (folio_memcg(folio) != from) + goto out_unlock; + + pgdat = folio_pgdat(folio); + from_vec = mem_cgroup_lruvec(from, pgdat); + to_vec = mem_cgroup_lruvec(to, pgdat); + + folio_memcg_lock(folio); + + if (folio_test_anon(folio)) { + if (folio_mapped(folio)) { + __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); + if (folio_test_transhuge(folio)) { + __mod_lruvec_state(from_vec, NR_ANON_THPS, + -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_THPS, + nr_pages); + } + } + } else { + __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); + + if (folio_test_swapbacked(folio)) { + __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); + __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); + } + + if (folio_mapped(folio)) { + __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); + } + + if (folio_test_dirty(folio)) { + struct address_space *mapping = folio_mapping(folio); + + if (mapping_can_writeback(mapping)) { + __mod_lruvec_state(from_vec, NR_FILE_DIRTY, + -nr_pages); + __mod_lruvec_state(to_vec, NR_FILE_DIRTY, + nr_pages); + } + } + } + + if (folio_test_writeback(folio)) { + __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); + __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); + } + + /* + * All state has been migrated, let's switch to the new memcg. + * + * It is safe to change page's memcg here because the page + * is referenced, charged, isolated, and locked: we can't race + * with (un)charging, migration, LRU putback, or anything else + * that would rely on a stable page's memory cgroup. + * + * Note that lock_page_memcg is a memcg lock, not a page lock, + * to save space. As soon as we switch page's memory cgroup to a + * new memcg that isn't locked, the above state can change + * concurrently again. 
Make sure we're truly done with it. + */ + smp_mb(); + + css_get(&to->css); + css_put(&from->css); + + folio->memcg_data = (unsigned long)to; + + __folio_memcg_unlock(from); + + ret = 0; + nid = folio_nid(folio); + + local_irq_disable(); + mem_cgroup_charge_statistics(to, nr_pages); + memcg_check_events(to, nid); + mem_cgroup_charge_statistics(from, -nr_pages); + memcg_check_events(from, nid); + local_irq_enable(); +out_unlock: + folio_unlock(folio); +out: + return ret; +} + +/** + * get_mctgt_type - get target type of moving charge + * @vma: the vma the pte to be checked belongs + * @addr: the address corresponding to the pte to be checked + * @ptent: the pte to be checked + * @target: the pointer the target page or swap ent will be stored(can be NULL) + * + * Returns + * 0(MC_TARGET_NONE): if the pte is not a target for move charge. + * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for + * move charge. if @target is not NULL, the page is stored in target->page + * with extra refcnt got(Callers should handle it). + * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a + * target for charge migration. if @target is not NULL, the entry is stored + * in target->ent. + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and + * thus not on the lru. + * For now we such page is charge like a regular page would be as for all + * intent and purposes it is just special memory taking the place of a + * regular page. + * + * See Documentations/vm/hmm.txt and include/linux/hmm.h + * + * Called with pte lock held. 
+ */ + +static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, union mc_target *target) +{ + struct page *page = NULL; + enum mc_target_type ret = MC_TARGET_NONE; + swp_entry_t ent = { .val = 0 }; + + if (pte_present(ptent)) + page = mc_handle_present_pte(vma, addr, ptent); + else if (pte_none_mostly(ptent)) + /* + * PTE markers should be treated as a none pte here, separated + * from other swap handling below. + */ + page = mc_handle_file_pte(vma, addr, ptent); + else if (is_swap_pte(ptent)) + page = mc_handle_swap_pte(vma, ptent, &ent); + + if (!page && !ent.val) + return ret; + if (page) { + /* + * Do only loose check w/o serialization. + * mem_cgroup_move_account() checks the page is valid or + * not under LRU exclusion. + */ + if (page_memcg(page) == mc.from) { + ret = MC_TARGET_PAGE; + if (is_device_private_page(page) || + is_device_coherent_page(page)) + ret = MC_TARGET_DEVICE; + if (target) + target->page = page; + } + if (!ret || !target) + put_page(page); + } + /* + * There is a swap entry and a page doesn't exist or isn't charged. + * But we cannot move a tail-page in a THP. + */ + if (ent.val && !ret && (!page || !PageTransCompound(page)) && + mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { + ret = MC_TARGET_SWAP; + if (target) + target->ent = ent; + } + return ret; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * We don't consider PMD mapped swapping or file mapped pages because THP does + * not support them for now. + * Caller should make sure that pmd_trans_huge(pmd) is true. 
+ */ +static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, union mc_target *target) +{ + struct page *page = NULL; + enum mc_target_type ret = MC_TARGET_NONE; + + if (unlikely(is_swap_pmd(pmd))) { + VM_BUG_ON(thp_migration_supported() && + !is_pmd_migration_entry(pmd)); + return ret; + } + page = pmd_page(pmd); + VM_BUG_ON_PAGE(!page || !PageHead(page), page); + if (!(mc.flags & MOVE_ANON)) + return ret; + if (page_memcg(page) == mc.from) { + ret = MC_TARGET_PAGE; + if (target) { + get_page(page); + target->page = page; + } + } + return ret; +} +#else +static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, union mc_target *target) +{ + return MC_TARGET_NONE; +} +#endif + +static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + pte_t *pte; + spinlock_t *ptl; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + /* + * Note their can not be MC_TARGET_DEVICE for now as we do not + * support transparent huge page with MEMORY_DEVICE_PRIVATE but + * this might change. 
+ */ + if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) + mc.precharge += HPAGE_PMD_NR; + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) + if (get_mctgt_type(vma, addr, *pte, NULL)) + mc.precharge++; /* increment precharge temporarily */ + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + + return 0; +} + +static const struct mm_walk_ops precharge_walk_ops = { + .pmd_entry = mem_cgroup_count_precharge_pte_range, +}; + +static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) +{ + unsigned long precharge; + + mmap_read_lock(mm); + walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); + mmap_read_unlock(mm); + + precharge = mc.precharge; + mc.precharge = 0; + + return precharge; +} + +static int mem_cgroup_precharge_mc(struct mm_struct *mm) +{ + unsigned long precharge = mem_cgroup_count_precharge(mm); + + VM_BUG_ON(mc.moving_task); + mc.moving_task = current; + return mem_cgroup_do_precharge(precharge); +} + +/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ +static void __mem_cgroup_clear_mc(void) +{ + struct mem_cgroup *from = mc.from; + struct mem_cgroup *to = mc.to; + + /* we must uncharge all the leftover precharges from mc.to */ + if (mc.precharge) { + cancel_charge(mc.to, mc.precharge); + mc.precharge = 0; + } + /* + * we didn't uncharge from mc.from at mem_cgroup_move_account(), so + * we must uncharge here. + */ + if (mc.moved_charge) { + cancel_charge(mc.from, mc.moved_charge); + mc.moved_charge = 0; + } + /* we must fixup refcnts and charges */ + if (mc.moved_swap) { + /* uncharge swap account from the old cgroup */ + if (!mem_cgroup_is_root(mc.from)) + page_counter_uncharge(&mc.from->memsw, mc.moved_swap); + + mem_cgroup_id_put_many(mc.from, mc.moved_swap); + + /* + * we charged both to->memory and to->memsw, so we + * should uncharge to->memory. 
+ */ + if (!mem_cgroup_is_root(mc.to)) + page_counter_uncharge(&mc.to->memory, mc.moved_swap); + + mc.moved_swap = 0; + } + memcg_oom_recover(from); + memcg_oom_recover(to); + wake_up_all(&mc.waitq); +} + +static void mem_cgroup_clear_mc(void) +{ + struct mm_struct *mm = mc.mm; + + /* + * we must clear moving_task before waking up waiters at the end of + * task migration. + */ + mc.moving_task = NULL; + __mem_cgroup_clear_mc(); + spin_lock(&mc.lock); + mc.from = NULL; + mc.to = NULL; + mc.mm = NULL; + spin_unlock(&mc.lock); + + mmput(mm); +} + +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ + struct mem_cgroup *from; + struct task_struct *leader, *p; + struct mm_struct *mm; + unsigned long move_flags; + int ret = 0; + + /* charge immigration isn't supported on the default hierarchy */ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return 0; + + /* + * Multi-process migrations only happen on the default hierarchy + * where charge immigration is not used. Perform charge + * immigration if @tset contains a leader and whine if there are + * multiple. + */ + p = NULL; + cgroup_taskset_for_each_leader(leader, css, tset) { + WARN_ON_ONCE(p); + p = leader; + memcg = mem_cgroup_from_css(css); + } + if (!p) + return 0; + + /* + * We are now committed to this value whatever it is. Changes in this + * tunable will only affect upcoming migrations, not the current one. + * So we need to save it, and keep it going. 
+ */ + move_flags = READ_ONCE(memcg->move_charge_at_immigrate); + if (!move_flags) + return 0; + + from = mem_cgroup_from_task(p); + + VM_BUG_ON(from == memcg); + + mm = get_task_mm(p); + if (!mm) + return 0; + /* We move charges only when we move a owner of the mm */ + if (mm->owner == p) { + VM_BUG_ON(mc.from); + VM_BUG_ON(mc.to); + VM_BUG_ON(mc.precharge); + VM_BUG_ON(mc.moved_charge); + VM_BUG_ON(mc.moved_swap); + + spin_lock(&mc.lock); + mc.mm = mm; + mc.from = from; + mc.to = memcg; + mc.flags = move_flags; + spin_unlock(&mc.lock); + /* We set mc.moving_task later */ + + ret = mem_cgroup_precharge_mc(mm); + if (ret) + mem_cgroup_clear_mc(); + } else { + mmput(mm); + } + return ret; +} + +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ + if (mc.to) + mem_cgroup_clear_mc(); +} + +static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + int ret = 0; + struct vm_area_struct *vma = walk->vma; + pte_t *pte; + spinlock_t *ptl; + enum mc_target_type target_type; + union mc_target target; + struct page *page; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + if (mc.precharge < HPAGE_PMD_NR) { + spin_unlock(ptl); + return 0; + } + target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); + if (target_type == MC_TARGET_PAGE) { + page = target.page; + if (!isolate_lru_page(page)) { + if (!mem_cgroup_move_account(page, true, + mc.from, mc.to)) { + mc.precharge -= HPAGE_PMD_NR; + mc.moved_charge += HPAGE_PMD_NR; + } + putback_lru_page(page); + } + put_page(page); + } else if (target_type == MC_TARGET_DEVICE) { + page = target.page; + if (!mem_cgroup_move_account(page, true, + mc.from, mc.to)) { + mc.precharge -= HPAGE_PMD_NR; + mc.moved_charge += HPAGE_PMD_NR; + } + put_page(page); + } + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_unstable(pmd)) + return 0; +retry: + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; addr += PAGE_SIZE) { + 
pte_t ptent = *(pte++); + bool device = false; + swp_entry_t ent; + + if (!mc.precharge) + break; + + switch (get_mctgt_type(vma, addr, ptent, &target)) { + case MC_TARGET_DEVICE: + device = true; + fallthrough; + case MC_TARGET_PAGE: + page = target.page; + /* + * We can have a part of the split pmd here. Moving it + * can be done but it would be too convoluted so simply + * ignore such a partial THP and keep it in original + * memcg. There should be somebody mapping the head. + */ + if (PageTransCompound(page)) + goto put; + if (!device && isolate_lru_page(page)) + goto put; + if (!mem_cgroup_move_account(page, false, + mc.from, mc.to)) { + mc.precharge--; + /* we uncharge from mc.from later. */ + mc.moved_charge++; + } + if (!device) + putback_lru_page(page); +put: /* get_mctgt_type() gets the page */ + put_page(page); + break; + case MC_TARGET_SWAP: + ent = target.ent; + if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { + mc.precharge--; + mem_cgroup_id_get_many(mc.to, 1); + /* we fixup other refcnts and charges later. */ + mc.moved_swap++; + } + break; + default: + break; + } + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + + if (addr != end) { + /* + * We have consumed all precharges we got in can_attach(). + * We try charge one by one, but don't do any additional + * charges to mc.to if we have failed in charge once in attach() + * phase. + */ + ret = mem_cgroup_do_precharge(1); + if (!ret) + goto retry; + } + + return ret; +} + +static const struct mm_walk_ops charge_walk_ops = { + .pmd_entry = mem_cgroup_move_charge_pte_range, +}; + +static void mem_cgroup_move_charge(void) +{ + lru_add_drain_all(); + /* + * Signal lock_page_memcg() to take the memcg's move_lock + * while we're moving its pages to another memcg. Then wait + * for already started RCU-only updates to finish. 
+ */ + atomic_inc(&mc.from->moving_account); + synchronize_rcu(); +retry: + if (unlikely(!mmap_read_trylock(mc.mm))) { + /* + * Someone who are holding the mmap_lock might be waiting in + * waitq. So we cancel all extra charges, wake up all waiters, + * and retry. Because we cancel precharges, we might not be able + * to move enough charges, but moving charge is a best-effort + * feature anyway, so it wouldn't be a big problem. + */ + __mem_cgroup_clear_mc(); + cond_resched(); + goto retry; + } + /* + * When we have consumed all precharges and failed in doing + * additional charge, the page walk just aborts. + */ + walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); + mmap_read_unlock(mc.mm); + atomic_dec(&mc.from->moving_account); +} + +static void mem_cgroup_move_task(void) +{ + if (mc.to) { + mem_cgroup_move_charge(); + mem_cgroup_clear_mc(); + } +} +#else /* !CONFIG_MMU */ +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) +{ + return 0; +} +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) +{ +} +static void mem_cgroup_move_task(void) +{ +} +#endif + +#ifdef CONFIG_LRU_GEN +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + + /* find the first leader if there is any */ + cgroup_taskset_for_each_leader(task, css, tset) + break; + + if (!task) + return; + + task_lock(task); + if (task->mm && READ_ONCE(task->mm->owner) == task) + lru_gen_migrate_mm(task->mm); + task_unlock(task); +} +#else +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ +} +#endif /* CONFIG_LRU_GEN */ + +static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) +{ + if (value == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); + + return 0; +} + +static u64 memory_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return 
(u64)page_counter_read(&memcg->memory) * PAGE_SIZE; +} + +static u64 memory_peak_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)memcg->memory.watermark * PAGE_SIZE; +} + +static int memory_min_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); +} + +static ssize_t memory_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long min; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &min); + if (err) + return err; + + page_counter_set_min(&memcg->memory, min); + + return nbytes; +} + +static int memory_low_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); +} + +static ssize_t memory_low_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long low; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &low); + if (err) + return err; + + page_counter_set_low(&memcg->memory, low); + + return nbytes; +} + +static int memory_high_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); +} + +static ssize_t memory_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_retries = MAX_RECLAIM_RETRIES; + bool drained = false; + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &high); + if (err) + return err; + + page_counter_set_high(&memcg->memory, high); + + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + unsigned long reclaimed; + + if (nr_pages <= high) 
+ break; + + if (signal_pending(current)) + break; + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); + + if (!reclaimed && !nr_retries--) + break; + } + + memcg_wb_domain_size_changed(memcg); + return nbytes; +} + +static int memory_max_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); +} + +static ssize_t memory_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_reclaims = MAX_RECLAIM_RETRIES; + bool drained = false; + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &max); + if (err) + return err; + + xchg(&memcg->memory.max, max); + + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + + if (nr_pages <= max) + break; + + if (signal_pending(current)) + break; + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + if (nr_reclaims) { + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) + nr_reclaims--; + continue; + } + + memcg_memory_event(memcg, MEMCG_OOM); + if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) + break; + } + + memcg_wb_domain_size_changed(memcg); + return nbytes; +} + +static void __memory_events_show(struct seq_file *m, atomic_long_t *events) +{ + seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW])); + seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH])); + seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX])); + seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); + seq_printf(m, "oom_kill %lu\n", + atomic_long_read(&events[MEMCG_OOM_KILL])); + seq_printf(m, "oom_group_kill %lu\n", + atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); +} + 
+static int memory_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + __memory_events_show(m, memcg->memory_events); + return 0; +} + +static int memory_events_local_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + __memory_events_show(m, memcg->memory_events_local); + return 0; +} + +static int memory_stat_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + + if (!buf) + return -ENOMEM; + memory_stat_format(memcg, buf, PAGE_SIZE); + seq_puts(m, buf); + kfree(buf); + return 0; +} + +#ifdef CONFIG_NUMA +static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec, + int item) +{ + return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item); +} + +static int memory_numa_stat_show(struct seq_file *m, void *v) +{ + int i; + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + mem_cgroup_flush_stats(); + + for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { + int nid; + + if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS) + continue; + + seq_printf(m, "%s", memory_stats[i].name); + for_each_node_state(nid, N_MEMORY) { + u64 size; + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); + size = lruvec_page_state_output(lruvec, + memory_stats[i].idx); + seq_printf(m, " N%d=%llu", nid, size); + } + seq_putc(m, '\n'); + } + + return 0; +} +#endif + +static int memory_oom_group_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + seq_printf(m, "%d\n", memcg->oom_group); + + return 0; +} + +static ssize_t memory_oom_group_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int ret, oom_group; + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + ret = kstrtoint(buf, 0, &oom_group); + if (ret) + return ret; + + if 
(oom_group != 0 && oom_group != 1) + return -EINVAL; + + memcg->oom_group = oom_group; + + return nbytes; +} + +static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_retries = MAX_RECLAIM_RETRIES; + unsigned long nr_to_reclaim, nr_reclaimed = 0; + unsigned int reclaim_options; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "", &nr_to_reclaim); + if (err) + return err; + + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; + while (nr_reclaimed < nr_to_reclaim) { + unsigned long reclaimed; + + if (signal_pending(current)) + return -EINTR; + + /* + * This is the final attempt, drain percpu lru caches in the + * hope of introducing more evictable pages for + * try_to_free_mem_cgroup_pages(). + */ + if (!nr_retries) + lru_add_drain_all(); + + reclaimed = try_to_free_mem_cgroup_pages(memcg, + nr_to_reclaim - nr_reclaimed, + GFP_KERNEL, reclaim_options); + + if (!reclaimed && !nr_retries--) + return -EAGAIN; + + nr_reclaimed += reclaimed; + } + + return nbytes; +} + +static struct cftype memory_files[] = { + { + .name = "current", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = memory_current_read, + }, + { + .name = "peak", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = memory_peak_read, + }, + { + .name = "min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_min_show, + .write = memory_min_write, + }, + { + .name = "low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_low_show, + .write = memory_low_write, + }, + { + .name = "high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_high_show, + .write = memory_high_write, + }, + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_max_show, + .write = memory_max_write, + }, + { + .name = "events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_file), + .seq_show = memory_events_show, + }, + { 
+ .name = "events.local", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_local_file), + .seq_show = memory_events_local_show, + }, + { + .name = "stat", + .seq_show = memory_stat_show, + }, +#ifdef CONFIG_NUMA + { + .name = "numa_stat", + .seq_show = memory_numa_stat_show, + }, +#endif + { + .name = "oom.group", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, + .seq_show = memory_oom_group_show, + .write = memory_oom_group_write, + }, + { + .name = "reclaim", + .flags = CFTYPE_NS_DELEGATABLE, + .write = memory_reclaim, + }, + { } /* terminate */ +}; + +struct cgroup_subsys memory_cgrp_subsys = { + .css_alloc = mem_cgroup_css_alloc, + .css_online = mem_cgroup_css_online, + .css_offline = mem_cgroup_css_offline, + .css_released = mem_cgroup_css_released, + .css_free = mem_cgroup_css_free, + .css_reset = mem_cgroup_css_reset, + .css_rstat_flush = mem_cgroup_css_rstat_flush, + .can_attach = mem_cgroup_can_attach, + .attach = mem_cgroup_attach, + .cancel_attach = mem_cgroup_cancel_attach, + .post_attach = mem_cgroup_move_task, + .dfl_cftypes = memory_files, + .legacy_cftypes = mem_cgroup_legacy_files, + .early_init = 0, +}; + +/* + * This function calculates an individual cgroup's effective + * protection which is derived from its own memory.min/low, its + * parent's and siblings' settings, as well as the actual memory + * distribution in the tree. + * + * The following rules apply to the effective protection values: + * + * 1. At the first level of reclaim, effective protection is equal to + * the declared protection in memory.min and memory.low. + * + * 2. To enable safe delegation of the protection configuration, at + * subsequent levels the effective protection is capped to the + * parent's effective protection. + * + * 3. To make complex and dynamic subtrees easier to configure, the + * user is allowed to overcommit the declared protection at a given + * level. 
If that is the case, the parent's effective protection is + * distributed to the children in proportion to how much protection + * they have declared and how much of it they are utilizing. + * + * This makes distribution proportional, but also work-conserving: + * if one cgroup claims much more protection than it uses memory, + * the unused remainder is available to its siblings. + * + * 4. Conversely, when the declared protection is undercommitted at a + * given level, the distribution of the larger parental protection + * budget is NOT proportional. A cgroup's protection from a sibling + * is capped to its own memory.min/low setting. + * + * 5. However, to allow protecting recursive subtrees from each other + * without having to declare each individual cgroup's fixed share + * of the ancestor's claim to protection, any unutilized - + * "floating" - protection from up the tree is distributed in + * proportion to each cgroup's *usage*. This makes the protection + * neutral wrt sibling cgroups and lets them compete freely over + * the shared parental protection budget, but it protects the + * subtree as a whole from neighboring subtrees. + * + * Note that 4. and 5. are not in conflict: 4. is about protecting + * against immediate siblings whereas 5. is about protecting against + * neighboring subtrees. + */ +static unsigned long effective_protection(unsigned long usage, + unsigned long parent_usage, + unsigned long setting, + unsigned long parent_effective, + unsigned long siblings_protected) +{ + unsigned long protected; + unsigned long ep; + + protected = min(usage, setting); + /* + * If all cgroups at this level combined claim and use more + * protection then what the parent affords them, distribute + * shares in proportion to utilization. 
+ * + * We are using actual utilization rather than the statically + * claimed protection in order to be work-conserving: claimed + * but unused protection is available to siblings that would + * otherwise get a smaller chunk than what they claimed. + */ + if (siblings_protected > parent_effective) + return protected * parent_effective / siblings_protected; + + /* + * Ok, utilized protection of all children is within what the + * parent affords them, so we know whatever this child claims + * and utilizes is effectively protected. + * + * If there is unprotected usage beyond this value, reclaim + * will apply pressure in proportion to that amount. + * + * If there is unutilized protection, the cgroup will be fully + * shielded from reclaim, but we do return a smaller value for + * protection than what the group could enjoy in theory. This + * is okay. With the overcommit distribution above, effective + * protection is always dependent on how memory is actually + * consumed among the siblings anyway. + */ + ep = protected; + + /* + * If the children aren't claiming (all of) the protection + * afforded to them by the parent, distribute the remainder in + * proportion to the (unprotected) memory of each cgroup. That + * way, cgroups that aren't explicitly prioritized wrt each + * other compete freely over the allowance, but they are + * collectively protected from neighboring trees. + * + * We're using unprotected memory for the weight so that if + * some cgroups DO claim explicit protection, we don't protect + * the same bytes twice. + * + * Check both usage and parent_usage against the respective + * protected values. One should imply the other, but they + * aren't read atomically - make sure the division is sane. 
+ */ + if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) + return ep; + if (parent_effective > siblings_protected && + parent_usage > siblings_protected && + usage > protected) { + unsigned long unclaimed; + + unclaimed = parent_effective - siblings_protected; + unclaimed *= usage - protected; + unclaimed /= parent_usage - siblings_protected; + + ep += unclaimed; + } + + return ep; +} + +/** + * mem_cgroup_calculate_protection - check if memory consumption is in the normal range + * @root: the top ancestor of the sub-tree being checked + * @memcg: the memory cgroup to check + * + * WARNING: This function is not stateless! It can only be used as part + * of a top-down tree iteration, not for isolated queries. + */ +void mem_cgroup_calculate_protection(struct mem_cgroup *root, + struct mem_cgroup *memcg) +{ + unsigned long usage, parent_usage; + struct mem_cgroup *parent; + + if (mem_cgroup_disabled()) + return; + + if (!root) + root = root_mem_cgroup; + + /* + * Effective values of the reclaim targets are ignored so they + * can be stale. Have a look at mem_cgroup_protection for more + * details. + * TODO: calculation should be more robust so that we do not need + * that special casing. 
+ */ + if (memcg == root) + return; + + usage = page_counter_read(&memcg->memory); + if (!usage) + return; + + parent = parent_mem_cgroup(memcg); + + if (parent == root) { + memcg->memory.emin = READ_ONCE(memcg->memory.min); + memcg->memory.elow = READ_ONCE(memcg->memory.low); + return; + } + + parent_usage = page_counter_read(&parent->memory); + + WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage, + READ_ONCE(memcg->memory.min), + READ_ONCE(parent->memory.emin), + atomic_long_read(&parent->memory.children_min_usage))); + + WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, + READ_ONCE(memcg->memory.low), + READ_ONCE(parent->memory.elow), + atomic_long_read(&parent->memory.children_low_usage))); +} + +static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, + gfp_t gfp) +{ + long nr_pages = folio_nr_pages(folio); + int ret; + + ret = try_charge(memcg, gfp, nr_pages); + if (ret) + goto out; + + css_get(&memcg->css); + commit_charge(folio, memcg); + + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, nr_pages); + memcg_check_events(memcg, folio_nid(folio)); + local_irq_enable(); +out: + return ret; +} + +int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) +{ + struct mem_cgroup *memcg; + int ret; + + memcg = get_mem_cgroup_from_mm(mm); + ret = charge_memcg(folio, memcg, gfp); + css_put(&memcg->css); + + return ret; +} + +/** + * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. + * @folio: folio to charge. + * @mm: mm context of the victim + * @gfp: reclaim mode + * @entry: swap entry for which the folio is allocated + * + * This function charges a folio allocated for swapin. Please call this before + * adding the folio to the swapcache. + * + * Returns 0 on success. Otherwise, an error code is returned. 
+ */ +int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, + gfp_t gfp, swp_entry_t entry) +{ + struct mem_cgroup *memcg; + unsigned short id; + int ret; + + if (mem_cgroup_disabled()) + return 0; + + id = lookup_swap_cgroup_id(entry); + rcu_read_lock(); + memcg = mem_cgroup_from_id(id); + if (!memcg || !css_tryget_online(&memcg->css)) + memcg = get_mem_cgroup_from_mm(mm); + rcu_read_unlock(); + + ret = charge_memcg(folio, memcg, gfp); + + css_put(&memcg->css); + return ret; +} + +/* + * mem_cgroup_swapin_uncharge_swap - uncharge swap slot + * @entry: swap entry for which the page is charged + * + * Call this function after successfully adding the charged page to swapcache. + * + * Note: This function assumes the page for which swap slot is being uncharged + * is order 0 page. + */ +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +{ + /* + * Cgroup1's unified memory+swap counter has been charged with the + * new swapcache page, finish the transfer by uncharging the swap + * slot. The swap slot would also get uncharged when it dies, but + * it can stick around indefinitely and we'd count the page twice + * the entire time. + * + * Cgroup2 has separate resource counters for memory and swap, + * so this is a non-issue here. Memory and swap charge lifetimes + * correspond 1:1 to page and swap slot lifetimes: we charge the + * page to memory here, and uncharge swap when the slot is freed. + */ + if (!mem_cgroup_disabled() && do_memsw_account()) { + /* + * The swap entry might not get freed for a long time, + * let's not wait for it. The page already received a + * memory+swap charge, drop the swap entry duplicate. 
+ */ + mem_cgroup_uncharge_swap(entry, 1); + } +} + +struct uncharge_gather { + struct mem_cgroup *memcg; + unsigned long nr_memory; + unsigned long pgpgout; + unsigned long nr_kmem; + int nid; +}; + +static inline void uncharge_gather_clear(struct uncharge_gather *ug) +{ + memset(ug, 0, sizeof(*ug)); +} + +static void uncharge_batch(const struct uncharge_gather *ug) +{ + unsigned long flags; + + if (ug->nr_memory) { + page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); + if (do_memsw_account()) + page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); + if (ug->nr_kmem) + memcg_account_kmem(ug->memcg, -ug->nr_kmem); + memcg_oom_recover(ug->memcg); + } + + local_irq_save(flags); + __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); + memcg_check_events(ug->memcg, ug->nid); + local_irq_restore(flags); + + /* drop reference from uncharge_folio */ + css_put(&ug->memcg->css); +} + +static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) +{ + long nr_pages; + struct mem_cgroup *memcg; + struct obj_cgroup *objcg; + + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + + /* + * Nobody should be changing or seriously looking at + * folio memcg or objcg at this point, we have fully + * exclusive access to the folio. + */ + if (folio_memcg_kmem(folio)) { + objcg = __folio_objcg(folio); + /* + * This get matches the put at the end of the function and + * kmem pages do not hold memcg references anymore. 
+ */ + memcg = get_mem_cgroup_from_objcg(objcg); + } else { + memcg = __folio_memcg(folio); + } + + if (!memcg) + return; + + if (ug->memcg != memcg) { + if (ug->memcg) { + uncharge_batch(ug); + uncharge_gather_clear(ug); + } + ug->memcg = memcg; + ug->nid = folio_nid(folio); + + /* pairs with css_put in uncharge_batch */ + css_get(&memcg->css); + } + + nr_pages = folio_nr_pages(folio); + + if (folio_memcg_kmem(folio)) { + ug->nr_memory += nr_pages; + ug->nr_kmem += nr_pages; + + folio->memcg_data = 0; + obj_cgroup_put(objcg); + } else { + /* LRU pages aren't accounted at the root level */ + if (!mem_cgroup_is_root(memcg)) + ug->nr_memory += nr_pages; + ug->pgpgout++; + + folio->memcg_data = 0; + } + + css_put(&memcg->css); +} + +void __mem_cgroup_uncharge(struct folio *folio) +{ + struct uncharge_gather ug; + + /* Don't touch folio->lru of any random page, pre-check: */ + if (!folio_memcg(folio)) + return; + + uncharge_gather_clear(&ug); + uncharge_folio(folio, &ug); + uncharge_batch(&ug); +} + +/** + * __mem_cgroup_uncharge_list - uncharge a list of page + * @page_list: list of pages to uncharge + * + * Uncharge a list of pages previously charged with + * __mem_cgroup_charge(). + */ +void __mem_cgroup_uncharge_list(struct list_head *page_list) +{ + struct uncharge_gather ug; + struct folio *folio; + + uncharge_gather_clear(&ug); + list_for_each_entry(folio, page_list, lru) + uncharge_folio(folio, &ug); + if (ug.memcg) + uncharge_batch(&ug); +} + +/** + * mem_cgroup_migrate - Charge a folio's replacement. + * @old: Currently circulating folio. + * @new: Replacement folio. + * + * Charge @new as a replacement folio for @old. @old will + * be uncharged upon free. + * + * Both folios must be locked, @new->mapping must be set up. 
+ */
+void mem_cgroup_migrate(struct folio *old, struct folio *new)
+{
+	struct mem_cgroup *memcg;
+	long nr_pages = folio_nr_pages(new);	/* asserted equal to @old's page count below */
+	unsigned long flags;
+
+	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
+	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
+	VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
+	VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
+
+	if (mem_cgroup_disabled())
+		return;
+
+	/* Page cache replacement: new folio already charged? */
+	if (folio_memcg(new))
+		return;
+
+	memcg = folio_memcg(old);
+	VM_WARN_ON_ONCE_FOLIO(!memcg, old);
+	if (!memcg)
+		return;	/* @old has no memcg: nothing to charge @new to */
+
+	/*
+	 * Force-charge the new page. The old one will be freed soon
+	 *
+	 * The counters are bumped with page_counter_charge() and nothing is
+	 * checked afterwards, so this step has no failure path. The root
+	 * memcg is skipped, and memsw is only charged when
+	 * do_memsw_account() says so.
+	 */
+	if (!mem_cgroup_is_root(memcg)) {
+		page_counter_charge(&memcg->memory, nr_pages);
+		if (do_memsw_account())
+			page_counter_charge(&memcg->memsw, nr_pages);
+	}
+
+	css_get(&memcg->css);	/* taken before memcg is committed to @new */
+	commit_charge(new, memcg);
+	/* Statistics and the event check run with local interrupts disabled. */
+	local_irq_save(flags);
+	mem_cgroup_charge_statistics(memcg, nr_pages);
+	memcg_check_events(memcg, folio_nid(new));
+	local_irq_restore(flags);
+}
+/*
+ * Static key, defined false and exported to modules.
+ * NOTE(review): presumably what mem_cgroup_sockets_enabled tests - confirm.
+ */
+DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
+EXPORT_SYMBOL(memcg_sockets_enabled_key);
+/*
+ * mem_cgroup_sk_alloc - associate a socket with the current task's memcg
+ * @sk: socket to set sk_memcg on
+ *
+ * Does nothing when socket accounting is not enabled, when not called from
+ * task context, when current's memcg is the root memcg, or when the memory
+ * controller is not on the default hierarchy and the memcg has no active
+ * tcpmem accounting. Otherwise a css reference is attempted with
+ * css_tryget() and, on success, the memcg is stored in sk->sk_memcg.
+ */
+void mem_cgroup_sk_alloc(struct sock *sk)
+{
+	struct mem_cgroup *memcg;
+
+	if (!mem_cgroup_sockets_enabled)
+		return;
+
+	/* Do not associate the sock with unrelated interrupted task's memcg. */
+	if (!in_task())
+		return;
+	/* current's memcg is looked up and pinned under the RCU read lock. */
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(current);
+	if (memcg == root_mem_cgroup)
+		goto out;
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
+		goto out;
+	if (css_tryget(&memcg->css))	/* on failure sk->sk_memcg is left untouched */
+		sk->sk_memcg = memcg;
+out:
+	rcu_read_unlock();
+}
+/*
+ * mem_cgroup_sk_free - drop the memcg reference held through a socket
+ * @sk: socket whose sk_memcg reference is dropped
+ *
+ * Puts the css reference of sk->sk_memcg when one is set (the counterpart
+ * of the css_tryget() in mem_cgroup_sk_alloc()). The pointer itself is not
+ * cleared here.
+ */
+void mem_cgroup_sk_free(struct sock *sk)
+{
+	if (sk->sk_memcg)
+		css_put(&sk->sk_memcg->css);
+}
+
+/**
+ * mem_cgroup_charge_skmem - charge socket memory
+ * @memcg: memcg to charge
+ * @nr_pages: number of pages to charge
+ * @gfp_mask: reclaim mode
+ *
+ * Charges @nr_pages to @memcg.
Returns %true if the charge fit within + * @memcg's configured limit, %false if it doesn't. + */ +bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, + gfp_t gfp_mask) +{ + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + struct page_counter *fail; + + if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { + memcg->tcpmem_pressure = 0; + return true; + } + memcg->tcpmem_pressure = 1; + if (gfp_mask & __GFP_NOFAIL) { + page_counter_charge(&memcg->tcpmem, nr_pages); + return true; + } + return false; + } + + if (try_charge(memcg, gfp_mask, nr_pages) == 0) { + mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); + return true; + } + + return false; +} + +/** + * mem_cgroup_uncharge_skmem - uncharge socket memory + * @memcg: memcg to uncharge + * @nr_pages: number of pages to uncharge + */ +void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + page_counter_uncharge(&memcg->tcpmem, nr_pages); + return; + } + + mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); + + refill_stock(memcg, nr_pages); +} + +static int __init cgroup_memory(char *s) +{ + char *token; + + while ((token = strsep(&s, ",")) != NULL) { + if (!*token) + continue; + if (!strcmp(token, "nosocket")) + cgroup_memory_nosocket = true; + if (!strcmp(token, "nokmem")) + cgroup_memory_nokmem = true; + } + return 1; +} +__setup("cgroup.memory=", cgroup_memory); + +/* + * subsys_initcall() for memory controller. + * + * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this + * context because of lock dependencies (cgroup_lock -> cpu hotplug) but + * basically everything that doesn't depend on a specific mem_cgroup structure + * should be initialized from here. + */ +static int __init mem_cgroup_init(void) +{ + int cpu, node; + + /* + * Currently s32 type (can refer to struct batched_lruvec_stat) is + * used for per-memcg-per-cpu caching of per-node statistics. 
In order + * to work fine, we should make sure that the overfill threshold can't + * exceed S32_MAX / PAGE_SIZE. + */ + BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE); + + cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, + memcg_hotplug_cpu_dead); + + for_each_possible_cpu(cpu) + INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, + drain_local_stock); + + for_each_node(node) { + struct mem_cgroup_tree_per_node *rtpn; + + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, + node_online(node) ? node : NUMA_NO_NODE); + + rtpn->rb_root = RB_ROOT; + rtpn->rb_rightmost = NULL; + spin_lock_init(&rtpn->lock); + soft_limit_tree.rb_tree_per_node[node] = rtpn; + } + + return 0; +} +subsys_initcall(mem_cgroup_init); + +#ifdef CONFIG_SWAP +static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +{ + while (!refcount_inc_not_zero(&memcg->id.ref)) { + /* + * The root cgroup cannot be destroyed, so it's refcount must + * always be >= 1. + */ + if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { + VM_BUG_ON(1); + break; + } + memcg = parent_mem_cgroup(memcg); + if (!memcg) + memcg = root_mem_cgroup; + } + return memcg; +} + +/** + * mem_cgroup_swapout - transfer a memsw charge to swap + * @folio: folio whose memsw charge to transfer + * @entry: swap entry to move the charge to + * + * Transfer the memsw charge of @folio to @entry. 
+ */ +void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) +{ + struct mem_cgroup *memcg, *swap_memcg; + unsigned int nr_entries; + unsigned short oldid; + + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); + + if (mem_cgroup_disabled()) + return; + + if (!do_memsw_account()) + return; + + memcg = folio_memcg(folio); + + VM_WARN_ON_ONCE_FOLIO(!memcg, folio); + if (!memcg) + return; + + /* + * In case the memcg owning these pages has been offlined and doesn't + * have an ID allocated to it anymore, charge the closest online + * ancestor for the swap instead and transfer the memory+swap charge. + */ + swap_memcg = mem_cgroup_id_get_online(memcg); + nr_entries = folio_nr_pages(folio); + /* Get references for the tail pages, too */ + if (nr_entries > 1) + mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), + nr_entries); + VM_BUG_ON_FOLIO(oldid, folio); + mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); + + folio->memcg_data = 0; + + if (!mem_cgroup_is_root(memcg)) + page_counter_uncharge(&memcg->memory, nr_entries); + + if (memcg != swap_memcg) { + if (!mem_cgroup_is_root(swap_memcg)) + page_counter_charge(&swap_memcg->memsw, nr_entries); + page_counter_uncharge(&memcg->memsw, nr_entries); + } + + /* + * Interrupts should be disabled here because the caller holds the + * i_pages lock which is taken with interrupts-off. It is + * important here to have the interrupts disabled because it is the + * only synchronisation we have for updating the per-CPU variables. 
+ */ + memcg_stats_lock(); + mem_cgroup_charge_statistics(memcg, -nr_entries); + memcg_stats_unlock(); + memcg_check_events(memcg, folio_nid(folio)); + + css_put(&memcg->css); +} + +/** + * __mem_cgroup_try_charge_swap - try charging swap space for a folio + * @folio: folio being added to swap + * @entry: swap entry to charge + * + * Try to charge @folio's memcg for the swap space at @entry. + * + * Returns 0 on success, -ENOMEM on failure. + */ +int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) +{ + unsigned int nr_pages = folio_nr_pages(folio); + struct page_counter *counter; + struct mem_cgroup *memcg; + unsigned short oldid; + + if (do_memsw_account()) + return 0; + + memcg = folio_memcg(folio); + + VM_WARN_ON_ONCE_FOLIO(!memcg, folio); + if (!memcg) + return 0; + + if (!entry.val) { + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); + return 0; + } + + memcg = mem_cgroup_id_get_online(memcg); + + if (!mem_cgroup_is_root(memcg) && + !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { + memcg_memory_event(memcg, MEMCG_SWAP_MAX); + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); + mem_cgroup_id_put(memcg); + return -ENOMEM; + } + + /* Get references for the tail pages, too */ + if (nr_pages > 1) + mem_cgroup_id_get_many(memcg, nr_pages - 1); + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); + VM_BUG_ON_FOLIO(oldid, folio); + mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); + + return 0; +} + +/** + * __mem_cgroup_uncharge_swap - uncharge swap space + * @entry: swap entry to uncharge + * @nr_pages: the amount of swap space to uncharge + */ +void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) +{ + struct mem_cgroup *memcg; + unsigned short id; + + if (mem_cgroup_disabled()) + return; + + id = swap_cgroup_record(entry, 0, nr_pages); + rcu_read_lock(); + memcg = mem_cgroup_from_id(id); + if (memcg) { + if (!mem_cgroup_is_root(memcg)) { + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, 
nr_pages); + else + page_counter_uncharge(&memcg->swap, nr_pages); + } + mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); + mem_cgroup_id_put_many(memcg, nr_pages); + } + rcu_read_unlock(); +} + +long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) +{ + long nr_swap_pages = get_nr_swap_pages(); + + if (mem_cgroup_disabled() || do_memsw_account()) + return nr_swap_pages; + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + nr_swap_pages = min_t(long, nr_swap_pages, + READ_ONCE(memcg->swap.max) - + page_counter_read(&memcg->swap)); + return nr_swap_pages; +} + +bool mem_cgroup_swap_full(struct folio *folio) +{ + struct mem_cgroup *memcg; + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (vm_swap_full()) + return true; + if (do_memsw_account()) + return false; + + memcg = folio_memcg(folio); + if (!memcg) + return false; + + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + unsigned long usage = page_counter_read(&memcg->swap); + + if (usage * 2 >= READ_ONCE(memcg->swap.high) || + usage * 2 >= READ_ONCE(memcg->swap.max)) + return true; + } + + return false; +} + +static int __init setup_swap_account(char *s) +{ + pr_warn_once("The swapaccount= commandline option is deprecated. 
" + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); + return 1; +} +__setup("swapaccount=", setup_swap_account); + +static u64 swap_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; +} + +static int swap_high_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); +} + +static ssize_t swap_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &high); + if (err) + return err; + + page_counter_set_high(&memcg->swap, high); + + return nbytes; +} + +static int swap_max_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); +} + +static ssize_t swap_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &max); + if (err) + return err; + + xchg(&memcg->swap.max, max); + + return nbytes; +} + +static int swap_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + seq_printf(m, "high %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); + seq_printf(m, "max %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); + seq_printf(m, "fail %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); + + return 0; +} + +static struct cftype swap_files[] = { + { + .name = "swap.current", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = swap_current_read, + }, + { + .name = "swap.high", + .flags = CFTYPE_NOT_ON_ROOT, + 
.seq_show = swap_high_show, + .write = swap_high_write, + }, + { + .name = "swap.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = swap_max_show, + .write = swap_max_write, + }, + { + .name = "swap.events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, swap_events_file), + .seq_show = swap_events_show, + }, + { } /* terminate */ +}; + +static struct cftype memsw_files[] = { + { + .name = "memsw.usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "memsw.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "memsw.limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "memsw.failcnt", + .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { }, /* terminate */ +}; + +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) +/** + * obj_cgroup_may_zswap - check if this cgroup can zswap + * @objcg: the object cgroup + * + * Check if the hierarchical zswap limit has been reached. + * + * This doesn't check for specific headroom, and it is not atomic + * either. But with zswap, the size of the allocation is only known + * once compression has occured, and this optimistic pre-check avoids + * spending cycles on compression when there is already no room left + * or zswap is disabled altogether somewhere in the hierarchy. 
+ */
+bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
+{
+	struct mem_cgroup *memcg, *original_memcg;
+	bool ret = true;
+
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return true;	/* not on the default hierarchy: no limit is checked */
+
+	original_memcg = get_mem_cgroup_from_objcg(objcg);
+	for (memcg = original_memcg; memcg != root_mem_cgroup;
+	     memcg = parent_mem_cgroup(memcg)) {
+		unsigned long max = READ_ONCE(memcg->zswap_max);
+		unsigned long pages;
+		/* Presumably "no limit configured" at this level: check the parent. */
+		if (max == PAGE_COUNTER_MAX)
+			continue;
+		if (max == 0) {	/* zswap disabled at this level of the hierarchy */
+			ret = false;
+			break;
+		}
+		/*
+		 * Flush rstat first, presumably so the state read below is
+		 * up to date. MEMCG_ZSWAP_B is divided by PAGE_SIZE so that
+		 * it can be compared against max.
+		 */
+		cgroup_rstat_flush(memcg->css.cgroup);
+		pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
+		if (pages < max)
+			continue;
+		ret = false;
+		break;
+	}
+	mem_cgroup_put(original_memcg);	/* pairs with get_mem_cgroup_from_objcg() above */
+	return ret;
+}
+
+/**
+ * obj_cgroup_charge_zswap - charge compression backend memory
+ * @objcg: the object cgroup
+ * @size: size of compressed object
+ *
+ * This forces the charge after obj_cgroup_may_zswap() allowed
+ * compression and storage in zswap for this cgroup to go ahead.
+ */
+void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
+{
+	struct mem_cgroup *memcg;
+
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		return;
+
+	VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
+
+	/* PF_MEMALLOC context, charging must succeed */
+	if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
+		VM_WARN_ON_ONCE(1);
+	/* The memcg pointer is only used inside the RCU read-side section. */
+	rcu_read_lock();
+	memcg = obj_cgroup_memcg(objcg);
+	mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);	/* compressed size */
+	mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);	/* one more stored object */
+	rcu_read_unlock();
+}
+
+/**
+ * obj_cgroup_uncharge_zswap - uncharge compression backend memory
+ * @objcg: the object cgroup
+ * @size: size of compressed object
+ *
+ * Uncharges zswap memory on page in.
+ */ +void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size) +{ + struct mem_cgroup *memcg; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return; + + obj_cgroup_uncharge(objcg, size); + + rcu_read_lock(); + memcg = obj_cgroup_memcg(objcg); + mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size); + mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1); + rcu_read_unlock(); +} + +static u64 zswap_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + cgroup_rstat_flush(css->cgroup); + return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B); +} + +static int zswap_max_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->zswap_max)); +} + +static ssize_t zswap_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &max); + if (err) + return err; + + xchg(&memcg->zswap_max, max); + + return nbytes; +} + +static struct cftype zswap_files[] = { + { + .name = "zswap.current", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = zswap_current_read, + }, + { + .name = "zswap.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = zswap_max_show, + .write = zswap_max_write, + }, + { } /* terminate */ +}; +#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ + +static int __init mem_cgroup_swap_init(void) +{ + if (mem_cgroup_disabled()) + return 0; + + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files)); +#endif + return 0; +} +subsys_initcall(mem_cgroup_swap_init); + +#endif /* CONFIG_SWAP */ diff --git a/mm/memfd.c b/mm/memfd.c new file mode 100644 index 000000000..b0104b49b --- /dev/null +++ b/mm/memfd.c @@ -0,0 +1,344 
@@ +/* + * memfd_create system call and file sealing support + * + * Code was originally included in shmem.c, and broken out to facilitate + * use by hugetlbfs as well as tmpfs. + * + * This file is released under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * We need a tag: a new tag would expand every xa_node by 8 bytes, + * so reuse a tag which we firmly believe is never set or cleared on tmpfs + * or hugetlbfs because they are memory only filesystems. + */ +#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE +#define LAST_SCAN 4 /* about 150ms max */ + +static void memfd_tag_pins(struct xa_state *xas) +{ + struct page *page; + int latency = 0; + int cache_count; + + lru_add_drain(); + + xas_lock_irq(xas); + xas_for_each(xas, page, ULONG_MAX) { + cache_count = 1; + if (!xa_is_value(page) && + PageTransHuge(page) && !PageHuge(page)) + cache_count = HPAGE_PMD_NR; + + if (!xa_is_value(page) && + page_count(page) - total_mapcount(page) != cache_count) + xas_set_mark(xas, MEMFD_TAG_PINNED); + if (cache_count != 1) + xas_set(xas, page->index + cache_count); + + latency += cache_count; + if (latency < XA_CHECK_SCHED) + continue; + latency = 0; + + xas_pause(xas); + xas_unlock_irq(xas); + cond_resched(); + xas_lock_irq(xas); + } + xas_unlock_irq(xas); +} + +/* + * Setting SEAL_WRITE requires us to verify there's no pending writer. However, + * via get_user_pages(), drivers might have some pending I/O without any active + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages + * and see whether it has an elevated ref-count. If so, we tag them and wait for + * them to be dropped. + * The caller must guarantee that no new user will acquire writable references + * to those pages to avoid races. 
+ */ +static int memfd_wait_for_pins(struct address_space *mapping) +{ + XA_STATE(xas, &mapping->i_pages, 0); + struct page *page; + int error, scan; + + memfd_tag_pins(&xas); + + error = 0; + for (scan = 0; scan <= LAST_SCAN; scan++) { + int latency = 0; + int cache_count; + + if (!xas_marked(&xas, MEMFD_TAG_PINNED)) + break; + + if (!scan) + lru_add_drain_all(); + else if (schedule_timeout_killable((HZ << scan) / 200)) + scan = LAST_SCAN; + + xas_set(&xas, 0); + xas_lock_irq(&xas); + xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) { + bool clear = true; + + cache_count = 1; + if (!xa_is_value(page) && + PageTransHuge(page) && !PageHuge(page)) + cache_count = HPAGE_PMD_NR; + + if (!xa_is_value(page) && cache_count != + page_count(page) - total_mapcount(page)) { + /* + * On the last scan, we clean up all those tags + * we inserted; but make a note that we still + * found pages pinned. + */ + if (scan == LAST_SCAN) + error = -EBUSY; + else + clear = false; + } + if (clear) + xas_clear_mark(&xas, MEMFD_TAG_PINNED); + + latency += cache_count; + if (latency < XA_CHECK_SCHED) + continue; + latency = 0; + + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); + } + xas_unlock_irq(&xas); + } + + return error; +} + +static unsigned int *memfd_file_seals_ptr(struct file *file) +{ + if (shmem_file(file)) + return &SHMEM_I(file_inode(file))->seals; + +#ifdef CONFIG_HUGETLBFS + if (is_file_hugepages(file)) + return &HUGETLBFS_I(file_inode(file))->seals; +#endif + + return NULL; +} + +#define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE) + +static int memfd_add_seals(struct file *file, unsigned int seals) +{ + struct inode *inode = file_inode(file); + unsigned int *file_seals; + int error; + + /* + * SEALING + * Sealing allows multiple parties to share a tmpfs or hugetlbfs file + * but restrict access to a specific subset of file operations. 
Seals + * can only be added, but never removed. This way, mutually untrusted + * parties can share common memory regions with a well-defined policy. + * A malicious peer can thus never perform unwanted operations on a + * shared object. + * + * Seals are only supported on special tmpfs or hugetlbfs files and + * always affect the whole underlying inode. Once a seal is set, it + * may prevent some kinds of access to the file. Currently, the + * following seals are defined: + * SEAL_SEAL: Prevent further seals from being set on this file + * SEAL_SHRINK: Prevent the file from shrinking + * SEAL_GROW: Prevent the file from growing + * SEAL_WRITE: Prevent write access to the file + * + * As we don't require any trust relationship between two parties, we + * must prevent seals from being removed. Therefore, sealing a file + * only adds a given set of seals to the file, it never touches + * existing seals. Furthermore, the "setting seals"-operation can be + * sealed itself, which basically prevents any further seal from being + * added. + * + * Semantics of sealing are only defined on volatile files. Only + * anonymous tmpfs and hugetlbfs files support sealing. More + * importantly, seals are never written to disk. Therefore, there's + * no plan to support it on other file types. 
+ */ + + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + if (seals & ~(unsigned int)F_ALL_SEALS) + return -EINVAL; + + inode_lock(inode); + + file_seals = memfd_file_seals_ptr(file); + if (!file_seals) { + error = -EINVAL; + goto unlock; + } + + if (*file_seals & F_SEAL_SEAL) { + error = -EPERM; + goto unlock; + } + + if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { + error = mapping_deny_writable(file->f_mapping); + if (error) + goto unlock; + + error = memfd_wait_for_pins(file->f_mapping); + if (error) { + mapping_allow_writable(file->f_mapping); + goto unlock; + } + } + + *file_seals |= seals; + error = 0; + +unlock: + inode_unlock(inode); + return error; +} + +static int memfd_get_seals(struct file *file) +{ + unsigned int *seals = memfd_file_seals_ptr(file); + + return seals ? *seals : -EINVAL; +} + +long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long error; + + switch (cmd) { + case F_ADD_SEALS: + /* disallow upper 32bit */ + if (arg > UINT_MAX) + return -EINVAL; + + error = memfd_add_seals(file, arg); + break; + case F_GET_SEALS: + error = memfd_get_seals(file); + break; + default: + error = -EINVAL; + break; + } + + return error; +} + +#define MFD_NAME_PREFIX "memfd:" +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) + +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + unsigned int *file_seals; + struct file *file; + int fd, error; + char *name; + long len; + + if (!(flags & MFD_HUGETLB)) { + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + } else { + /* Allow huge page size encoding in flags. 
*/ + if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) + return -EINVAL; + } + + /* length includes terminating zero */ + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); + if (len <= 0) + return -EFAULT; + if (len > MFD_NAME_MAX_LEN + 1) + return -EINVAL; + + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); + if (!name) + return -ENOMEM; + + strcpy(name, MFD_NAME_PREFIX); + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + error = -EFAULT; + goto err_name; + } + + /* terminating-zero may have changed after strnlen_user() returned */ + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { + error = -EFAULT; + goto err_name; + } + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + if (flags & MFD_HUGETLB) { + file = hugetlb_file_setup(name, 0, VM_NORESERVE, + HUGETLB_ANONHUGE_INODE, + (flags >> MFD_HUGE_SHIFT) & + MFD_HUGE_MASK); + } else + file = shmem_file_setup(name, 0, VM_NORESERVE); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + file->f_flags |= O_LARGEFILE; + + if (flags & MFD_ALLOW_SEALING) { + file_seals = memfd_file_seals_ptr(file); + if (file_seals) + *file_seals &= ~F_SEAL_SEAL; + } + + fd_install(fd, file); + kfree(name); + return fd; + +err_fd: + put_unused_fd(fd); +err_name: + kfree(name); + return error; +} diff --git a/mm/memory-failure.c b/mm/memory-failure.c new file mode 100644 index 000000000..5b846ed5d --- /dev/null +++ b/mm/memory-failure.c @@ -0,0 +1,2629 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2008, 2009 Intel Corporation + * Authors: Andi Kleen, Fengguang Wu + * + * High level machine check handler. Handles pages reported by the + * hardware as being corrupted usually due to a multi-bit ECC memory or cache + * failure. 
+ * + * In addition there is a "soft offline" entry point that allows stop using + * not-yet-corrupted-by-suspicious pages without killing anything. + * + * Handles page cache pages in various states. The tricky part + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means + * the error handling takes potentially a long time. + * + * It can be very tempting to add handling for obscure cases here. + * In general any code for handling new cases should only be added iff: + * - You know how to test it. + * - You have a test that can be added to mce-test + * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ + * - The case actually shows up as a frequent (top 10) page state in + * tools/vm/page-types when running a real workload. + * + * There are several operations here with exponential complexity because + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and + * has non linear complexity with the number. But since memory corruptions + * are rare we hope to get away with this. This avoids impacting the core + * VM. 
+ */ + +#define pr_fmt(fmt) "Memory failure: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "swap.h" +#include "internal.h" +#include "ras/ras_event.h" + +int sysctl_memory_failure_early_kill __read_mostly = 0; + +int sysctl_memory_failure_recovery __read_mostly = 1; + +atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); + +static bool hw_memory_failure __read_mostly = false; + +/* + * Return values: + * 1: the page is dissolved (if needed) and taken off from buddy, + * 0: the page is dissolved (if needed) and not taken off from buddy, + * < 0: failed to dissolve. + */ +static int __page_handle_poison(struct page *page) +{ + int ret; + + zone_pcp_disable(page_zone(page)); + ret = dissolve_free_huge_page(page); + if (!ret) + ret = take_page_off_buddy(page); + zone_pcp_enable(page_zone(page)); + + return ret; +} + +static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release) +{ + if (hugepage_or_freepage) { + /* + * Doing this check for free pages is also fine since dissolve_free_huge_page + * returns 0 for non-hugetlb pages as well. + */ + if (__page_handle_poison(page) <= 0) + /* + * We could fail to take off the target page from buddy + * for example due to racy page allocation, but that's + * acceptable because soft-offlined page is not broken + * and if someone really want to use it, they should + * take it. 
+ */ + return false; + } + + SetPageHWPoison(page); + if (release) + put_page(page); + page_ref_inc(page); + num_poisoned_pages_inc(); + + return true; +} + +#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) + +u32 hwpoison_filter_enable = 0; +u32 hwpoison_filter_dev_major = ~0U; +u32 hwpoison_filter_dev_minor = ~0U; +u64 hwpoison_filter_flags_mask; +u64 hwpoison_filter_flags_value; +EXPORT_SYMBOL_GPL(hwpoison_filter_enable); +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); + +static int hwpoison_filter_dev(struct page *p) +{ + struct address_space *mapping; + dev_t dev; + + if (hwpoison_filter_dev_major == ~0U && + hwpoison_filter_dev_minor == ~0U) + return 0; + + mapping = page_mapping(p); + if (mapping == NULL || mapping->host == NULL) + return -EINVAL; + + dev = mapping->host->i_sb->s_dev; + if (hwpoison_filter_dev_major != ~0U && + hwpoison_filter_dev_major != MAJOR(dev)) + return -EINVAL; + if (hwpoison_filter_dev_minor != ~0U && + hwpoison_filter_dev_minor != MINOR(dev)) + return -EINVAL; + + return 0; +} + +static int hwpoison_filter_flags(struct page *p) +{ + if (!hwpoison_filter_flags_mask) + return 0; + + if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == + hwpoison_filter_flags_value) + return 0; + else + return -EINVAL; +} + +/* + * This allows stress tests to limit test scope to a collection of tasks + * by putting them under some memcg. This prevents killing unrelated/important + * processes such as /sbin/init. Note that the target task may share clean + * pages with init (eg. libc text), which is harmless. If the target task + * share _dirty_ pages with another task B, the test scheme must make sure B + * is also included in the memcg. At last, due to race conditions this filter + * can only guarantee that the page either belongs to the memcg tasks, or is + * a freed page. 
+ */ +#ifdef CONFIG_MEMCG +u64 hwpoison_filter_memcg; +EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); +static int hwpoison_filter_task(struct page *p) +{ + if (!hwpoison_filter_memcg) + return 0; + + if (page_cgroup_ino(p) != hwpoison_filter_memcg) + return -EINVAL; + + return 0; +} +#else +static int hwpoison_filter_task(struct page *p) { return 0; } +#endif + +int hwpoison_filter(struct page *p) +{ + if (!hwpoison_filter_enable) + return 0; + + if (hwpoison_filter_dev(p)) + return -EINVAL; + + if (hwpoison_filter_flags(p)) + return -EINVAL; + + if (hwpoison_filter_task(p)) + return -EINVAL; + + return 0; +} +#else +int hwpoison_filter(struct page *p) +{ + return 0; +} +#endif + +EXPORT_SYMBOL_GPL(hwpoison_filter); + +/* + * Kill all processes that have a poisoned page mapped and then isolate + * the page. + * + * General strategy: + * Find all processes having the page mapped and kill them. + * But we keep a page reference around so that the page is not + * actually freed yet. + * Then stash the page away + * + * There's no convenient way to get back to mapped processes + * from the VMAs. So do a brute-force search over all + * running processes. + * + * Remember that machine checks are not common (or rather + * if they are common you have other problems), so this shouldn't + * be a performance issue. + * + * Also there are some races possible while we get from the + * error detection to actually handle it. + */ + +struct to_kill { + struct list_head nd; + struct task_struct *tsk; + unsigned long addr; + short size_shift; +}; + +/* + * Send all the processes who have the page mapped a signal. 
+ * ``action optional'' if they are not immediately affected by the error
+ * ``action required'' if error happened in current execution context
+ */
+static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
+{
+	struct task_struct *t = tk->tsk;
+	short addr_lsb = tk->size_shift;	/* log2 mapping size, sent as the address LSB */
+	int ret = 0;
+
+	pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
+			pfn, t->comm, t->pid);
+
+	if ((flags & MF_ACTION_REQUIRED) && (t == current))
+		ret = force_sig_mceerr(BUS_MCEERR_AR,
+				 (void __user *)tk->addr, addr_lsb);
+	else
+		/*
+		 * Signal other processes sharing the page if they have
+		 * PF_MCE_EARLY set.
+		 * Don't use force here, it's convenient if the signal
+		 * can be temporarily blocked.
+		 * This could cause a loop when the user sets SIGBUS
+		 * to SIG_IGN, but hopefully no one will do that?
+		 */
+		ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
+				      addr_lsb, t);
+	if (ret < 0)
+		pr_info("Error sending signal to %s:%d: %d\n",
+			t->comm, t->pid, ret);
+	return ret;
+}
+
+/*
+ * Unknown page type encountered. Try to check whether it can turn PageLRU by
+ * lru_add_drain_all.
+ *
+ * Hugetlb pages return immediately and slab pages are not drained; for any
+ * other page lru_add_drain_all() is called. Nothing is reported back to the
+ * caller, who has to re-check the page state itself.
+ */
+void shake_page(struct page *p)
+{
+	if (PageHuge(p))
+		return;
+
+	if (!PageSlab(p)) {
+		lru_add_drain_all();
+		if (PageLRU(p) || is_free_buddy_page(p))
+			return;
+	}
+
+	/*
+	 * TODO: Could shrink slab caches here if a lightweight range-based
+	 * shrinker will be available.
+	 */
+}
+EXPORT_SYMBOL_GPL(shake_page);
+/*
+ * Walk the page tables of @vma's mm at @address and return the shift of the
+ * devmap entry mapping it: PUD_SHIFT or PMD_SHIFT for a devmap PUD/PMD,
+ * PAGE_SHIFT for a present devmap PTE. Returns 0 if some level is not
+ * present or the PTE is not a present devmap entry. @address must not be
+ * -EFAULT.
+ */
+static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
+		unsigned long address)
+{
+	unsigned long ret = 0;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	VM_BUG_ON_VMA(address == -EFAULT, vma);
+	pgd = pgd_offset(vma->vm_mm, address);
+	if (!pgd_present(*pgd))
+		return 0;
+	p4d = p4d_offset(pgd, address);
+	if (!p4d_present(*p4d))
+		return 0;
+	pud = pud_offset(p4d, address);
+	if (!pud_present(*pud))
+		return 0;
+	if (pud_devmap(*pud))
+		return PUD_SHIFT;
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		return 0;
+	if (pmd_devmap(*pmd))
+		return PMD_SHIFT;
+	pte = pte_offset_map(pmd, address);
+	if (pte_present(*pte) && pte_devmap(*pte))
+		ret = PAGE_SHIFT;
+	pte_unmap(pte);
+	return ret;
+}
+
+/*
+ * Failure handling: if we can't find or can't kill a process there's
+ * not much we can do. We just print a message and ignore otherwise.
+ */
+
+#define FSDAX_INVALID_PGOFF ULONG_MAX	/* sentinel: no fsdax page offset supplied */
+
+/*
+ * Schedule a process for later kill.
+ * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ *
+ * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
+ *   filesystem with a memory failure handler has claimed the
+ *   memory_failure event. In all other cases, page->index and
+ *   page->mapping are sufficient for mapping the page back to its
+ *   corresponding user virtual address.
+ */
+static void add_to_kill(struct task_struct *tsk, struct page *p,
+			pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
+			struct list_head *to_kill)
+{
+	struct to_kill *tk;
+
+	tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
+	if (!tk) {
+		pr_err("Out of memory while machine check handling\n");
+		return;
+	}
+
+	tk->addr = page_address_in_vma(p, vma);
+	if (is_zone_device_page(p)) {
+		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
+			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+	} else
+		tk->size_shift = page_shift(compound_head(p));
+
+	/*
+	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
+	 * "tk->size_shift" is always non-zero for !is_zone_device_page(),
+	 * so "tk->size_shift == 0" effectively checks no mapping on
+	 * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
+	 * to a process' address space, it's possible not all N VMAs
+	 * contain mappings for the page, but at least one VMA does.
+	 * Only deliver SIGBUS with payload derived from the VMA that
+	 * has a mapping for the page.
+	 */
+	if (tk->addr == -EFAULT) {
+		pr_info("Unable to find user space address %lx in %s\n",
+			page_to_pfn(p), tsk->comm);
+	} else if (tk->size_shift == 0) {
+		kfree(tk);
+		return;
+	}
+	/* Pin the task; kill_procs() drops this reference when it frees the entry. */
+	get_task_struct(tsk);
+	tk->tsk = tsk;
+	list_add_tail(&tk->nd, to_kill);
+}
+
+/*
+ * Kill the processes that have been collected earlier.
+ *
+ * Only do anything when FORCEKILL is set, otherwise just free the
+ * list (this is used for clean pages which do not need killing)
+ * Also when FAIL is set do a force kill because something went
+ * wrong earlier.
+ *
+ * With FORCEKILL set, an entry gets SIGKILL if FAIL is set or its address
+ * is -EFAULT, and the signal chosen by kill_proc() otherwise. Every entry is
+ * unlinked, its task reference dropped and its memory freed, whether or not
+ * a signal was sent.
+ */
+static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
+		unsigned long pfn, int flags)
+{
+	struct to_kill *tk, *next;
+
+	list_for_each_entry_safe(tk, next, to_kill, nd) {
+		if (forcekill) {
+			/*
+			 * In case something went wrong with munmapping
+			 * make sure the process doesn't catch the
+			 * signal and then access the memory. Just kill it.
+			 */
+			if (fail || tk->addr == -EFAULT) {
+				pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+				       pfn, tk->tsk->comm, tk->tsk->pid);
+				do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
+						 tk->tsk, PIDTYPE_PID);
+			}
+
+			/*
+			 * In theory the process could have mapped
+			 * something else on the address in-between. We could
+			 * check for that, but we need to tell the
+			 * process anyways.
+			 */
+			else if (kill_proc(tk, pfn, flags) < 0)
+				pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
+				       pfn, tk->tsk->comm, tk->tsk->pid);
+		}
+		list_del(&tk->nd);
+		put_task_struct(tk->tsk);
+		kfree(tk);
+	}
+}
+
+/*
+ * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
+ * on behalf of the thread group. Return task_struct of the (first found)
+ * dedicated thread if found, and return NULL otherwise.
+ *
+ * A thread with PF_MCE_PROCESS set is chosen only if it also has
+ * PF_MCE_EARLY set; a thread without PF_MCE_PROCESS is chosen whenever
+ * sysctl_memory_failure_early_kill is non-zero.
+ *
+ * We already hold read_lock(&tasklist_lock) in the caller, so we don't
+ * have to call rcu_read_lock/unlock() in this function.
+ */
+static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
+{
+	struct task_struct *t;
+
+	for_each_thread(tsk, t) {
+		if (t->flags & PF_MCE_PROCESS) {
+			if (t->flags & PF_MCE_EARLY)
+				return t;
+		} else {
+			if (sysctl_memory_failure_early_kill)
+				return t;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Determine whether a given process is "early kill" process which expects
+ * to be signaled when some page under the process is hwpoisoned.
+ * Return task_struct of the dedicated thread (main thread unless explicitly
+ * specified) if the process is "early kill" and otherwise returns NULL.
+ *
+ * Note that the above is true for Action Optional case. For Action Required
+ * case, it's only meaningful to the current thread which need to be signaled
+ * with SIGBUS, this error is Action Optional for other non current
+ * processes sharing the same error page,if the process is "early kill", the
+ * task_struct of the dedicated thread will also be returned.
+ */
+static struct task_struct *task_early_kill(struct task_struct *tsk,
+					   int force_early)
+{
+	if (!tsk->mm)
+		return NULL;	/* tasks without an mm are never selected */
+	/*
+	 * Comparing ->mm here because current task might represent
+	 * a subthread, while tsk always points to the main thread.
+	 */
+	if (force_early && tsk->mm == current->mm)
+		return current;
+
+	return find_early_kill_thread(tsk);
+}
+
+/*
+ * Collect processes when the error hit an anonymous page.
+ *
+ * Runs under the anon_vma read lock and tasklist_lock. For every process
+ * for which task_early_kill() selects a thread, each VMA on the page's
+ * anon_vma that belongs to that thread's mm and actually maps the page is
+ * queued on @to_kill. Nothing is collected if the anon_vma cannot be locked.
+ */
+static void collect_procs_anon(struct page *page, struct list_head *to_kill,
+				int force_early)
+{
+	struct folio *folio = page_folio(page);
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+	struct anon_vma *av;
+	pgoff_t pgoff;
+
+	av = folio_lock_anon_vma_read(folio, NULL);
+	if (av == NULL)	/* Not actually mapped anymore */
+		return;
+
+	pgoff = page_to_pgoff(page);
+	read_lock(&tasklist_lock);
+	for_each_process (tsk) {
+		struct anon_vma_chain *vmac;
+		struct task_struct *t = task_early_kill(tsk, force_early);
+
+		if (!t)
+			continue;
+		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
+					       pgoff, pgoff) {
+			vma = vmac->vma;
+			if (vma->vm_mm != t->mm)
+				continue;
+			if (!page_mapped_in_vma(page, vma))
+				continue;
+			add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma, to_kill);
+		}
+	}
+	read_unlock(&tasklist_lock);
+	anon_vma_unlock_read(av);
+}
+
+/*
+ * Collect processes when the error hit a file mapped page.
+ */
+static void collect_procs_file(struct page *page, struct list_head *to_kill,
+				int force_early)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff;
+
+	i_mmap_lock_read(mapping);
+	read_lock(&tasklist_lock);
+	pgoff = page_to_pgoff(page);
+	for_each_process(tsk) {
+		struct task_struct *t = task_early_kill(tsk, force_early);
+
+		if (!t)
+			continue;
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
+				      pgoff) {
+			/*
+			 * Send early kill signal to tasks where a vma covers
+			 * the page but the corrupted page is not necessarily
+			 * mapped in its pte.
+			 * Assume applications who requested early kill want
+			 * to be informed of all such data corruptions.
+			 */
+			if (vma->vm_mm == t->mm)
+				add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
+					    to_kill);
+		}
+	}
+	read_unlock(&tasklist_lock);
+	i_mmap_unlock_read(mapping);
+}
+
+#ifdef CONFIG_FS_DAX
+/*
+ * Collect processes when the error hit a fsdax page.
+ *
+ * Unlike collect_procs_file(), the mapping and page offset are supplied by
+ * the caller, task_early_kill() is always called with force_early set, and
+ * @pgoff is handed on to add_to_kill() as the fsdax page offset.
+ */
+static void collect_procs_fsdax(struct page *page,
+		struct address_space *mapping, pgoff_t pgoff,
+		struct list_head *to_kill)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+
+	i_mmap_lock_read(mapping);
+	read_lock(&tasklist_lock);
+	for_each_process(tsk) {
+		struct task_struct *t = task_early_kill(tsk, true);
+
+		if (!t)
+			continue;
+		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+			if (vma->vm_mm == t->mm)
+				add_to_kill(t, page, pgoff, vma, to_kill);
+		}
+	}
+	read_unlock(&tasklist_lock);
+	i_mmap_unlock_read(mapping);
+}
+#endif /* CONFIG_FS_DAX */
+
+/*
+ * Collect the processes who have the corrupted page mapped to kill.
+ */
+static void collect_procs(struct page *page, struct list_head *tokill,
+				int force_early)
+{
+	if (!page->mapping)
+		return;
+	/* Dispatch on the kind of mapping that backs the page. */
+	if (PageAnon(page))
+		collect_procs_anon(page, tokill, force_early);
+	else
+		collect_procs_file(page, tokill, force_early);
+}
+/*
+ * Private state of the hwp_walk_ops page-table walk: @tk receives the user
+ * address and size shift of the entry that maps the poisoned page, @pfn is
+ * the poisoned pfn being searched for. @flags is not read by the walker
+ * callbacks below.
+ */
+struct hwp_walk {
+	struct to_kill tk;
+	unsigned long pfn;
+	int flags;
+};
+/* Record @addr and @shift in @tk. */
+static void set_to_kill(struct to_kill *tk, unsigned long addr, short shift)
+{
+	tk->addr = addr;
+	tk->size_shift = shift;
+}
+/*
+ * Check whether @pte refers to @poisoned_pfn, either as a present mapping
+ * or as a hwpoison swap entry. On a match, record @addr and @shift in @tk
+ * and return 1; otherwise return 0. A pfn of 0 never matches.
+ */
+static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
+				unsigned long poisoned_pfn, struct to_kill *tk)
+{
+	unsigned long pfn = 0;
+
+	if (pte_present(pte)) {
+		pfn = pte_pfn(pte);
+	} else {
+		swp_entry_t swp = pte_to_swp_entry(pte);
+
+		if (is_hwpoison_entry(swp))
+			pfn = swp_offset_pfn(swp);
+	}
+
+	if (!pfn || pfn != poisoned_pfn)
+		return 0;
+
+	set_to_kill(tk, addr, shift);
+	return 1;
+}
+/*
+ * Huge-PMD variant of the check: if the poisoned pfn lies within the
+ * HPAGE_PMD_NR pfns mapped by a present *@pmdp, record the user address of
+ * the poisoned subpage (with PAGE_SHIFT granularity) in @hwp->tk and return
+ * 1; otherwise return 0. Without CONFIG_TRANSPARENT_HUGEPAGE it always
+ * returns 0.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
+				      struct hwp_walk *hwp)
+{
+	pmd_t pmd = *pmdp;
+	unsigned long pfn;
+	unsigned long hwpoison_vaddr;
+
+	if (!pmd_present(pmd))
+		return 0;
+	pfn = pmd_pfn(pmd);
+	if (pfn <= hwp->pfn && hwp->pfn < pfn + HPAGE_PMD_NR) {
+		hwpoison_vaddr = addr + ((hwp->pfn - pfn) << PAGE_SHIFT);
+		set_to_kill(&hwp->tk, hwpoison_vaddr, PAGE_SHIFT);
+		return 1;
+	}
+	return 0;
+}
+#else
+static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
+				      struct hwp_walk *hwp)
+{
+	return 0;
+}
+#endif
+/*
+ * .pmd_entry callback of hwp_walk_ops. A huge PMD is checked under its lock
+ * with check_hwpoisoned_pmd_entry(); otherwise the PTEs covering
+ * [@addr, @end) are scanned under the PTE lock, stopping at the first match.
+ * Returns 1 if an entry mapping the poisoned pfn was found, 0 otherwise.
+ */
+static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
+			      unsigned long end, struct mm_walk *walk)
+{
+	struct hwp_walk *hwp = walk->private;
+	int ret = 0;
+	pte_t *ptep, *mapped_pte;
+	spinlock_t *ptl;
+
+	ptl = pmd_trans_huge_lock(pmdp, walk->vma);
+	if (ptl) {
+		ret = check_hwpoisoned_pmd_entry(pmdp, addr, hwp);
+		spin_unlock(ptl);
+		goto out;
+	}
+
+	if (pmd_trans_unstable(pmdp))
+		goto out;
+
+	mapped_pte = ptep = pte_offset_map_lock(walk->vma->vm_mm, pmdp,
+						addr, &ptl);
+	for (; addr != end; ptep++, addr += PAGE_SIZE) {
+		ret = check_hwpoisoned_entry(*ptep, addr, PAGE_SHIFT,
+					     hwp->pfn, &hwp->tk);
+		if (ret == 1)
+			break;
+	}
+	pte_unmap_unlock(mapped_pte, ptl);
+out:
+	cond_resched();
+	return ret;
+}
+/*
+ * .hugetlb_entry callback of hwp_walk_ops: check a single hugetlb entry,
+ * using the huge page shift of the VMA's hstate as the mapping size.
+ * Defined as NULL without CONFIG_HUGETLB_PAGE.
+ */
+#ifdef CONFIG_HUGETLB_PAGE
+static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
+			    unsigned long addr, unsigned long end,
+			    struct mm_walk *walk)
+{
+	struct hwp_walk *hwp = walk->private;
+	pte_t pte = huge_ptep_get(ptep);
+	struct hstate *h = hstate_vma(walk->vma);
+
+	return check_hwpoisoned_entry(pte, addr, huge_page_shift(h),
+				      hwp->pfn, &hwp->tk);
+}
+#else
+#define hwpoison_hugetlb_range	NULL
+#endif
+
+static const struct mm_walk_ops hwp_walk_ops = {
+	.pmd_entry = hwpoison_pte_range,
+	.hugetlb_entry = hwpoison_hugetlb_range,
+};
+
+/*
+ * Sends SIGBUS to the current process with error info.
+ *
+ * This function is intended to handle "Action Required" MCEs on already
+ * hardware poisoned pages. They could happen, for example, when
+ * memory_failure() failed to unmap the error page at the first call, or
+ * when multiple local machine checks happened on different CPUs.
+ *
+ * MCE handler currently has no easy access to the error virtual address,
+ * so this function walks page table to find it. The returned virtual address
+ * is proper in most cases, but it could be wrong when the application
+ * process has multiple entries mapping the error page.
+ *
+ * The walk covers [0, TASK_SIZE) of @p's mm under the mmap read lock.
+ * Returns -EHWPOISON if an entry with a non-zero address was found (and
+ * kill_proc() was called for it), and -EFAULT otherwise, including when @p
+ * has no mm.
+ */
+static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
+				  int flags)
+{
+	int ret;
+	struct hwp_walk priv = {
+		.pfn = pfn,
+	};
+	priv.tk.tsk = p;
+
+	if (!p->mm)
+		return -EFAULT;
+
+	mmap_read_lock(p->mm);
+	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
+			      (void *)&priv);
+	if (ret == 1 && priv.tk.addr)
+		kill_proc(&priv.tk, pfn, flags);
+	else
+		ret = 0;
+	mmap_read_unlock(p->mm);
+	return ret > 0 ? -EHWPOISON : -EFAULT;
+}
+/*
+ * Human-readable names used in the log messages of this file: action_name[]
+ * is indexed by the handling result (MF_*), action_page_types[] by the page
+ * type (MF_MSG_*).
+ */
+static const char *action_name[] = {
+	[MF_IGNORED] = "Ignored",
+	[MF_FAILED] = "Failed",
+	[MF_DELAYED] = "Delayed",
+	[MF_RECOVERED] = "Recovered",
+};
+
+static const char * const action_page_types[] = {
+	[MF_MSG_KERNEL]			= "reserved kernel page",
+	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
+	[MF_MSG_SLAB]			= "kernel slab page",
+	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
+	[MF_MSG_HUGE]			= "huge page",
+	[MF_MSG_FREE_HUGE]		= "free huge page",
+	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
+	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
+	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
+	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
+	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
+	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
+	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
+	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
+	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
+	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
+	[MF_MSG_BUDDY]			= "free buddy page",
+	[MF_MSG_DAX]			= "dax page",
+	[MF_MSG_UNSPLIT_THP]		= "unsplit thp",
+	[MF_MSG_UNKNOWN]		= "unknown page",
+};
+
+/*
+ * XXX: It is possible that a page is isolated from LRU cache,
+ * and then kept in swap cache or failed to remove from page cache.
+ * The page count will stop it from being freed by unpoison.
+ * Stress tests should be aware of this memory leak problem.
+ *
+ * Returns 0 if the page was isolated from the LRU (its flags cleared, its
+ * memcg charge dropped and the isolation reference put), -EIO otherwise.
+ */
+static int delete_from_lru_cache(struct page *p)
+{
+	if (!isolate_lru_page(p)) {
+		/*
+		 * Clear sensible page flags, so that the buddy system won't
+		 * complain when the page is unpoison-and-freed.
+		 */
+		ClearPageActive(p);
+		ClearPageUnevictable(p);
+
+		/*
+		 * Poisoned page might never drop its ref count to 0 so we have
+		 * to uncharge it manually from its memcg.
+		 */
+		mem_cgroup_uncharge(page_folio(p));
+
+		/*
+		 * drop the page count elevated by isolate_lru_page()
+		 */
+		put_page(p);
+		return 0;
+	}
+	return -EIO;
+}
+/*
+ * Remove @p from the page cache of @mapping. If the address space provides
+ * ->error_remove_page() it is called, followed by filemap_release_folio();
+ * otherwise the page is invalidated with invalidate_inode_page(). Returns
+ * MF_RECOVERED on success and MF_FAILED, after logging the reason, on any
+ * failure.
+ */
+static int truncate_error_page(struct page *p, unsigned long pfn,
+				struct address_space *mapping)
+{
+	int ret = MF_FAILED;
+
+	if (mapping->a_ops->error_remove_page) {
+		struct folio *folio = page_folio(p);
+		int err = mapping->a_ops->error_remove_page(mapping, p);
+
+		if (err != 0)
+			pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
+		else if (!filemap_release_folio(folio, GFP_NOIO))
+			pr_info("%#lx: failed to release buffers\n", pfn);
+		else
+			ret = MF_RECOVERED;
+	} else {
+		/*
+		 * If the file system doesn't support it just invalidate
+		 * This fails on dirty or anything with private pages
+		 */
+		if (invalidate_inode_page(p))
+			ret = MF_RECOVERED;
+		else
+			pr_info("%#lx: Failed to invalidate\n", pfn);
+	}
+
+	return ret;
+}
+/*
+ * One row of the error_states[] table: the row applies to a page when
+ * (page flags & @mask) == @res. @type names the page type for reporting and
+ * @action performs the actual handling.
+ */
+struct page_state {
+	unsigned long mask;
+	unsigned long res;
+	enum mf_action_page_type type;
+
+	/* Callback ->action() has to unlock the relevant page inside it. */
+	int (*action)(struct page_state *ps, struct page *p);
+};
+
+/*
+ * Return true if page is still referenced by others, otherwise return
+ * false.
+ *
+ * The extra_pins is true when one extra refcount is expected.
+ */
+static bool has_extra_refcount(struct page_state *ps, struct page *p,
+			       bool extra_pins)
+{
+	int count = page_count(p) - 1;
+
+	if (extra_pins)
+		count -= 1;
+
+	if (count > 0) {
+		pr_err("%#lx: %s still referenced by %d users\n",
+		       page_to_pfn(p), action_page_types[ps->type], count);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Error hit kernel page.
+ * Do nothing, try to be lucky and not touch this instead. For a few cases we
+ * could be more sophisticated.
+ */
+static int me_kernel(struct page_state *ps, struct page *p)
+{
+	unlock_page(p);
+	return MF_IGNORED;
+}
+
+/*
+ * Page in unknown state. Do nothing.
+ */
+static int me_unknown(struct page_state *ps, struct page *p)
+{
+	pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
+	unlock_page(p);
+	return MF_FAILED;
+}
+
+/*
+ * Clean (or cleaned) page cache page.
+ *
+ * Takes the page off the LRU and, unless it is anonymous, removes it from
+ * its mapping with truncate_error_page(). Returns MF_RECOVERED for
+ * anonymous pages, MF_FAILED if the mapping is already gone or unexpected
+ * references remain, and the result of truncate_error_page() otherwise.
+ * The page is unlocked on every path.
+ */
+static int me_pagecache_clean(struct page_state *ps, struct page *p)
+{
+	int ret;
+	struct address_space *mapping;
+	bool extra_pins;
+
+	delete_from_lru_cache(p);
+
+	/*
+	 * For anonymous pages we're done: the only reference left
+	 * should be the one m_f() holds.
+	 */
+	if (PageAnon(p)) {
+		ret = MF_RECOVERED;
+		goto out;
+	}
+
+	/*
+	 * Now truncate the page in the page cache. This is really
+	 * more like a "temporary hole punch"
+	 * Don't do this for block devices when someone else
+	 * has a reference, because it could be file system metadata
+	 * and that's not safe to truncate.
+	 */
+	mapping = page_mapping(p);
+	if (!mapping) {
+		/*
+		 * Page has been torn down in the meanwhile
+		 */
+		ret = MF_FAILED;
+		goto out;
+	}
+
+	/*
+	 * The shmem page is kept in page cache instead of truncating
+	 * so is expected to have an extra refcount after error-handling.
+	 */
+	extra_pins = shmem_mapping(mapping);
+
+	/*
+	 * Truncation is a bit tricky. Enable it per file system for now.
+	 *
+	 * Open: to take i_rwsem or not for this? Right now we don't.
+	 */
+	ret = truncate_error_page(p, page_to_pfn(p), mapping);
+	if (has_extra_refcount(ps, p, extra_pins))
+		ret = MF_FAILED;
+
+out:
+	unlock_page(p);
+
+	return ret;
+}
+
+/*
+ * Dirty pagecache page
+ * Issues: when the error hit a hole page the error is not properly
+ * propagated.
+ *
+ * Flags the page with PageError and, if it has a mapping, records -EIO on
+ * that mapping; the page is then handled like a clean page cache page.
+ */
+static int me_pagecache_dirty(struct page_state *ps, struct page *p)
+{
+	struct address_space *mapping = page_mapping(p);
+
+	SetPageError(p);
+	/* TBD: print more information about the file. */
+	if (mapping) {
+		/*
+		 * IO error will be reported by write(), fsync(), etc.
+		 * who check the mapping.
+		 * This way the application knows that something went
+		 * wrong with its dirty file data.
+		 *
+		 * There's one open issue:
+		 *
+		 * The EIO will be only reported on the next IO
+		 * operation and then cleared through the IO map.
+		 * Normally Linux has two mechanisms to pass IO error
+		 * first through the AS_EIO flag in the address space
+		 * and then through the PageError flag in the page.
+		 * Since we drop pages on memory failure handling the
+		 * only mechanism open to use is through AS_EIO.
+		 *
+		 * This has the disadvantage that it gets cleared on
+		 * the first operation that returns an error, while
+		 * the PageError bit is more sticky and only cleared
+		 * when the page is reread or dropped.  If an
+		 * application assumes it will always get error on
+		 * fsync, but does other operations on the fd before
+		 * and the page is dropped between then the error
+		 * will not be properly reported.
+		 *
+		 * This can already happen even without hwpoisoned
+		 * pages: first on metadata IO errors (which only
+		 * report through AS_EIO) or when the page is dropped
+		 * at the wrong time.
+		 *
+		 * So right now we assume that the application DTRT on
+		 * the first EIO, but we're not worse than other parts
+		 * of the kernel.
+		 */
+		mapping_set_error(mapping, -EIO);
+	}
+
+	return me_pagecache_clean(ps, p);
+}
+
+/*
+ * Clean and dirty swap cache.
+ *
+ * Dirty swap cache page is tricky to handle. The page could live both in page
+ * cache and swap cache(ie. page is freshly swapped in). So it could be
+ * referenced concurrently by 2 types of PTEs:
+ * normal PTEs and swap PTEs. We try to handle them consistently by calling
+ * try_to_unmap(!TTU_HWPOISON) to convert the normal PTEs to swap PTEs,
+ * and then
+ *      - clear dirty bit to prevent IO
+ *      - remove from LRU
+ *      - but keep in the swap cache, so that when we return to it on
+ *        a later page fault, we know the application is accessing
+ *        corrupted data and shall be killed (we installed simple
+ *        interception code in do_swap_page to catch it).
+ *
+ * Clean swap cache pages can be directly isolated.
A later page fault will
+ * bring in the known good data from disk.
+ */
+static int me_swapcache_dirty(struct page_state *ps, struct page *p)
+{
+	int ret;
+	bool extra_pins = false;
+
+	ClearPageDirty(p);
+	/* Trigger EIO in shmem: */
+	ClearPageUptodate(p);
+
+	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
+	unlock_page(p);
+
+	if (ret == MF_DELAYED)
+		extra_pins = true;
+
+	if (has_extra_refcount(ps, p, extra_pins))
+		ret = MF_FAILED;
+
+	return ret;
+}
+/*
+ * Clean swap cache page: remove it from the swap cache and from the LRU.
+ * Returns MF_RECOVERED, or MF_FAILED if the LRU isolation fails or
+ * references beyond the expected one remain. The folio is unlocked before
+ * the reference check.
+ */
+static int me_swapcache_clean(struct page_state *ps, struct page *p)
+{
+	struct folio *folio = page_folio(p);
+	int ret;
+
+	delete_from_swap_cache(folio);
+
+	ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
+	folio_unlock(folio);
+
+	if (has_extra_refcount(ps, p, false))
+		ret = MF_FAILED;
+
+	return ret;
+}
+
+/*
+ * Huge pages. Needs work.
+ * Issues:
+ * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
+ *   To narrow down kill region to one page, we need to break up pmd.
+ *
+ * A hugetlb page with a mapping is truncated from it and is expected to
+ * keep one extra reference; one without a mapping has its reference dropped
+ * and is handed to __page_handle_poison(). Returns MF_RECOVERED or
+ * MF_FAILED for hugetlb pages, and MF_DELAYED for a compound page that is
+ * not hugetlb.
+ */
+static int me_huge_page(struct page_state *ps, struct page *p)
+{
+	int res;
+	struct page *hpage = compound_head(p);
+	struct address_space *mapping;
+	bool extra_pins = false;
+	/* NOTE(review): this early return does not unlock the page, unlike every other path here - confirm that is intended. */
+	if (!PageHuge(hpage))
+		return MF_DELAYED;
+
+	mapping = page_mapping(hpage);
+	if (mapping) {
+		res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+		/* The page is kept in page cache. */
+		extra_pins = true;
+		unlock_page(hpage);
+	} else {
+		unlock_page(hpage);
+		/*
+		 * migration entry prevents later access on error hugepage,
+		 * so we can free and dissolve it into buddy to save healthy
+		 * subpages.
+		 */
+		put_page(hpage);
+		if (__page_handle_poison(p) >= 0) {
+			page_ref_inc(p);
+			res = MF_RECOVERED;
+		} else {
+			res = MF_FAILED;
+		}
+	}
+
+	if (has_extra_refcount(ps, p, extra_pins))
+		res = MF_FAILED;
+
+	return res;
+}
+
+/*
+ * Various page states we can handle.
+ *
+ * A page state is defined by its current page->flags bits.
+ * The table matches them in order and calls the right handler.
+ *
+ * This is quite tricky because we can access page at any time
+ * in its live cycle, so all accesses have to be extremely careful.
+ *
+ * This is not complete. More states could be added.
+ * For any missing state don't attempt recovery.
+ */
+/* Short aliases for page-flag bits; they exist only for the table and are #undef'ed right after it. */
+#define dirty		(1UL << PG_dirty)
+#define sc		((1UL << PG_swapcache) | (1UL << PG_swapbacked))
+#define unevict		(1UL << PG_unevictable)
+#define mlock		(1UL << PG_mlocked)
+#define lru		(1UL << PG_lru)
+#define head		(1UL << PG_head)
+#define slab		(1UL << PG_slab)
+#define reserved	(1UL << PG_reserved)
+/*
+ * Each row is { mask, res, type, action }. Rows are tried from the top and
+ * the first one whose res equals (flags & mask) is used, so more specific
+ * rows must come before more general ones.
+ */
+static struct page_state error_states[] = {
+	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
+	/*
+	 * free pages are specially detected outside this table:
+	 * PG_buddy pages only make a small fraction of all free pages.
+	 */
+
+	/*
+	 * Could in theory check if slab page is free or if we can drop
+	 * currently unused objects without touching them. But just
+	 * treat it as standard kernel for now.
+	 */
+	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },
+
+	{ head,		head,		MF_MSG_HUGE,		me_huge_page },
+
+	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
+	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },
+
+	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
+	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },
+
+	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
+	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },
+
+	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
+	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },
+
+	/*
+	 * Catchall entry: must be at end.
+	 */
+	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
+};
+
+#undef dirty
+#undef sc
+#undef unevict
+#undef mlock
+#undef lru
+#undef head
+#undef slab
+#undef reserved
+
+/*
+ * "Dirty/Clean" indication is not 100% accurate due to the possibility of
+ * setting PG_dirty outside page lock.
See also comment above set_page_dirty().
+ */
+static void action_result(unsigned long pfn, enum mf_action_page_type type,
+			  enum mf_result result)
+{
+	trace_memory_failure_event(pfn, type, result);
+
+	num_poisoned_pages_inc();
+	pr_err("%#lx: recovery action for %s: %s\n",
+		pfn, action_page_types[type], action_name[result]);
+}
+/*
+ * Run the handler of @ps on @p, report the outcome through action_result()
+ * and translate it into an errno: 0 for MF_RECOVERED or MF_DELAYED, -EBUSY
+ * for anything else.
+ */
+static int page_action(struct page_state *ps, struct page *p,
+			unsigned long pfn)
+{
+	int result;
+
+	/* page p should be unlocked after returning from ps->action().  */
+	result = ps->action(ps, p);
+
+	action_result(pfn, ps->type, result);
+
+	/* Could do more checks here if page looks ok */
+	/*
+	 * Could adjust zone counters here to correct for the missing page.
+	 */
+
+	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
+}
+/*
+ * A page counts as "taken off" when it is hwpoisoned and its private field
+ * holds MAGIC_HWPOISON.
+ */
+static inline bool PageHWPoisonTakenOff(struct page *page)
+{
+	return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
+}
+/* Mark @page as taken off by storing MAGIC_HWPOISON in its private field. */
+void SetPageHWPoisonTakenOff(struct page *page)
+{
+	set_page_private(page, MAGIC_HWPOISON);
+}
+/* Reset the private field of @page, but only if the page is hwpoisoned. */
+void ClearPageHWPoisonTakenOff(struct page *page)
+{
+	if (PageHWPoison(page))
+		set_page_private(page, 0);
+}
+
+/*
+ * Return true if a page type of a given page is supported by hwpoison
+ * mechanism (while handling could fail), otherwise false.  This function
+ * does not return true for hugetlb or device memory pages, so it's assumed
+ * to be called only in the context where we never have such pages.
+ */
+static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
+{
+	/* Soft offline could migrate non-LRU movable pages */
+	if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
+		return true;
+
+	return PageLRU(page) || is_free_buddy_page(page);
+}
+/*
+ * Try to take a reference on the head page of @page.
+ *
+ * If get_hwpoison_huge_page() identifies the page as hugetlb, its result is
+ * returned unchanged. Otherwise returns -EBUSY if the head page is not
+ * handlable, 1 if a reference was taken and @page still belongs to that
+ * head, and 0 if no reference could be taken or the compound page changed
+ * underneath us (the reference is dropped again in that case).
+ */
+static int __get_hwpoison_page(struct page *page, unsigned long flags)
+{
+	struct page *head = compound_head(page);
+	int ret = 0;
+	bool hugetlb = false;
+
+	ret = get_hwpoison_huge_page(head, &hugetlb);
+	if (hugetlb)
+		return ret;
+
+	/*
+	 * This check prevents from calling get_page_unless_zero() for any
+	 * unsupported type of page in order to reduce the risk of unexpected
+	 * races caused by taking a page refcount.
+	 */
+	if (!HWPoisonHandlable(head, flags))
+		return -EBUSY;
+
+	if (get_page_unless_zero(head)) {
+		if (head == compound_head(page))
+			return 1;
+
+		pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
+		put_page(head);
+	}
+
+	return 0;
+}
+/*
+ * Get a reference on @p for error handling, retrying when the page state
+ * races with allocation or freeing. The retry budget (three, counted in
+ * "pass") is shared by all retry paths. With MF_COUNT_INCREASED the caller
+ * already holds a reference and none is taken on the first pass.
+ *
+ * Returns 1 with a reference held on a hugetlb or handlable page, 0 if no
+ * reference was taken and the page has a zero refcount and is hugetlb or a
+ * free buddy page, -EBUSY if the page kept racing with an allocation, and
+ * -EIO (logged) if the page cannot be handled.
+ */
+static int get_any_page(struct page *p, unsigned long flags)
+{
+	int ret = 0, pass = 0;
+	bool count_increased = false;
+
+	if (flags & MF_COUNT_INCREASED)
+		count_increased = true;
+
+try_again:
+	if (!count_increased) {
+		ret = __get_hwpoison_page(p, flags);
+		if (!ret) {
+			if (page_count(p)) {
+				/* We raced with an allocation, retry. */
+				if (pass++ < 3)
+					goto try_again;
+				ret = -EBUSY;
+			} else if (!PageHuge(p) && !is_free_buddy_page(p)) {
+				/* We raced with put_page, retry. */
+				if (pass++ < 3)
+					goto try_again;
+				ret = -EIO;
+			}
+			goto out;
+		} else if (ret == -EBUSY) {
+			/*
+			 * We raced with (possibly temporary) unhandlable
+			 * page, retry.
+			 */
+			if (pass++ < 3) {
+				shake_page(p);
+				goto try_again;
+			}
+			ret = -EIO;
+			goto out;
+		}
+	}
+
+	if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
+		ret = 1;
+	} else {
+		/*
+		 * A page we cannot handle. Check whether we can turn
+		 * it into something we can handle.
+		 */
+		if (pass++ < 3) {
+			put_page(p);
+			shake_page(p);
+			count_increased = false;
+			goto try_again;
+		}
+		put_page(p);
+		ret = -EIO;
+	}
+out:
+	if (ret == -EIO)
+		pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
+
+	return ret;
+}
+/*
+ * Reference grab used on the unpoison path. Returns the result of
+ * get_hwpoison_huge_page() for hugetlb pages, -EHWPOISON (without taking a
+ * reference) for pages that were taken off the buddy allocator, and
+ * otherwise 1 if get_page_unless_zero() succeeded and 0 if it did not.
+ */
+static int __get_unpoison_page(struct page *page)
+{
+	struct page *head = compound_head(page);
+	int ret = 0;
+	bool hugetlb = false;
+
+	ret = get_hwpoison_huge_page(head, &hugetlb);
+	if (hugetlb)
+		return ret;
+
+	/*
+	 * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
+	 * but also isolated from buddy freelist, so need to identify the
+	 * state and have to cancel both operations to unpoison.
+	 */
+	if (PageHWPoisonTakenOff(page))
+		return -EHWPOISON;
+
+	return get_page_unless_zero(page) ? 1 : 0;
+}
+
+/**
+ * get_hwpoison_page() - Get refcount for memory error handling
+ * @p:		Raw error page (hit by memory error)
+ * @flags:	Flags controlling behavior of error handling
+ *
+ * get_hwpoison_page() takes a page refcount of an error page to handle memory
+ * error on it, after checking that the error page is in a well-defined state
+ * (defined as a page-type we can successfully handle the memory error on it,
+ * such as LRU page and hugetlb page).
+ *
+ * Memory error handling could be triggered at any time on any type of page,
+ * so it's prone to race with typical memory management lifecycle (like
+ * allocation and free).  So to avoid such races, get_hwpoison_page() takes
+ * extra care for the error page's state (as done in __get_hwpoison_page()),
+ * and has some retry logic in get_any_page().
+ *
+ * When called from unpoison_memory(), the caller should already ensure that
+ * the given page has PG_hwpoison. So it's never reused for other page
+ * allocations, and __get_unpoison_page() never races with them.
+ *
+ * Return: 0 on failure,
+ *         1 on success for in-use pages in a well-defined state,
+ *         -EIO for pages on which we can not handle memory errors,
+ *         -EBUSY when get_hwpoison_page() has raced with page lifecycle
+ *         operations like allocation and free,
+ *         -EHWPOISON when the page is hwpoisoned and taken off from buddy.
+ */
+static int get_hwpoison_page(struct page *p, unsigned long flags)
+{
+	int ret;
+
+	/* The refcount attempt runs with the pcp lists of p's zone disabled. */
+	zone_pcp_disable(page_zone(p));
+	if (flags & MF_UNPOISON)
+		ret = __get_unpoison_page(p);
+	else
+		ret = get_any_page(p, flags);
+	zone_pcp_enable(page_zone(p));
+
+	return ret;
+}
+
+/*
+ * Do all that is necessary to remove user space mappings. Unmap
+ * the pages and send SIGBUS to the processes if the data was dirty.
+ *
+ * @p is the raw error page, @pfn its page frame number, @flags the MF_*
+ * flags of the error event and @hpage the page whose mappings are torn
+ * down (the callers in this file pass either the hugetlb head page or @p
+ * itself).
+ *
+ * Returns true when there is nothing to unmap or when @p is no longer
+ * mapped after try_to_unmap(); returns false for KSM pages and when @p is
+ * still mapped afterwards.
+ */
+static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
+				  int flags, struct page *hpage)
+{
+	struct folio *folio = page_folio(hpage);
+	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
+	struct address_space *mapping;
+	LIST_HEAD(tokill);
+	bool unmap_success;
+	int forcekill;
+	bool mlocked = PageMlocked(hpage);
+
+	/*
+	 * Here we are interested only in user-mapped pages, so skip any
+	 * other types of pages.
+	 */
+	if (PageReserved(p) || PageSlab(p) || PageTable(p))
+		return true;
+	if (!(PageLRU(hpage) || PageHuge(p)))
+		return true;
+
+	/*
+	 * This check implies we don't kill processes if their pages
+	 * are in the swap cache early. Those are always late kills.
+	 */
+	if (!page_mapped(p))
+		return true;
+
+	if (PageKsm(p)) {
+		pr_err("%#lx: can't handle KSM pages.\n", pfn);
+		return false;
+	}
+
+	if (PageSwapCache(p)) {
+		pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
+		ttu &= ~TTU_HWPOISON;
+	}
+
+	/*
+	 * Propagate the dirty bit from PTEs to struct page first, because we
+	 * need this to decide if we should kill or just drop the page.
+	 * XXX: the dirty test could be racy: set_page_dirty() may not always
+	 * be called inside page lock (it's recommended but not enforced).
+	 */
+	mapping = page_mapping(hpage);
+	if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
+	    mapping_can_writeback(mapping)) {
+		if (page_mkclean(hpage)) {
+			SetPageDirty(hpage);
+		} else {
+			ttu &= ~TTU_HWPOISON;
+			pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
+				pfn);
+		}
+	}
+
+	/*
+	 * First collect all the processes that have the page
+	 * mapped in dirty form.  This has to be done before try_to_unmap,
+	 * because ttu takes the rmap data structures down.
+	 */
+	collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
+
+	if (PageHuge(hpage) && !PageAnon(hpage)) {
+		/*
+		 * For hugetlb pages in shared mappings, try_to_unmap
+		 * could potentially call huge_pmd_unshare.  Because of
+		 * this, take semaphore in write mode here and set
+		 * TTU_RMAP_LOCKED to indicate we have taken the lock
+		 * at this higher level.
+		 */
+		mapping = hugetlb_page_mapping_lock_write(hpage);
+		if (mapping) {
+			try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
+			i_mmap_unlock_write(mapping);
+		} else
+			pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
+	} else {
+		try_to_unmap(folio, ttu);
+	}
+
+	unmap_success = !page_mapped(p);
+	if (!unmap_success)
+		pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
+		       pfn, page_mapcount(p));
+
+	/*
+	 * try_to_unmap() might put mlocked page in lru cache, so call
+	 * shake_page() again to ensure that it's flushed.
+	 */
+	if (mlocked)
+		shake_page(hpage);
+
+	/*
+	 * Now that the dirty bit has been propagated to the
+	 * struct page and all unmaps done we can decide if
+	 * killing is needed or not.  Only kill when the page
+	 * was dirty or the process is not restartable,
+	 * otherwise the tokill list is merely
+	 * freed.  When there was a problem unmapping earlier
+	 * use a more force-full uncatchable kill to prevent
+	 * any accesses to the poisoned memory.
+	 */
+	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) ||
+		    !unmap_success;
+	kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
+
+	return unmap_success;
+}
+
+/*
+ * Pick the error_states[] entry that describes @p and run its action via
+ * page_action(). @page_flags is the copy of the page flags the caller saved
+ * before the containment actions could modify them.
+ */
+static int identify_page_state(unsigned long pfn, struct page *p,
+				unsigned long page_flags)
+{
+	struct page_state *ps;
+
+	/*
+	 * The first check uses the current page flags which may not have any
+	 * relevant information. The second check with the saved page flags is
+	 * carried out only if the first check can't determine the page status.
+	 */
+	for (ps = error_states;; ps++)
+		if ((p->flags & ps->mask) == ps->res)
+			break;
+
+	/* Carry the current PG_dirty state over into the saved flags. */
+	page_flags |= (p->flags & (1UL << PG_dirty));
+
+	/* A zero mask matches anything, i.e. no specific state was found. */
+	if (!ps->mask)
+		for (ps = error_states;; ps++)
+			if ((page_flags & ps->mask) == ps->res)
+				break;
+	return page_action(ps, p, pfn);
+}
+
+/*
+ * Split the huge page containing @page under the page lock. Returns the
+ * result of split_huge_page(); on failure one reference on @page is
+ * dropped before returning.
+ */
+static int try_to_split_thp_page(struct page *page)
+{
+	int ret;
+
+	lock_page(page);
+	ret = split_huge_page(page);
+	unlock_page(page);
+
+	if (unlikely(ret))
+		put_page(page);
+
+	return ret;
+}
+
+/*
+ * Unmap the file range around @index of @mapping and then signal every
+ * task on @to_kill through kill_procs(). The unmapped range is sized after
+ * the largest size_shift found on the list and aligned down to that size;
+ * nothing is unmapped when no entry carries a size_shift.
+ */
+static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
+		struct address_space *mapping, pgoff_t index, int flags)
+{
+	struct to_kill *tk;
+	unsigned long size = 0;
+
+	list_for_each_entry(tk, to_kill, nd)
+		if (tk->size_shift)
+			size = max(size, 1UL << tk->size_shift);
+
+	if (size) {
+		/*
+		 * Unmap the largest mapping to avoid breaking up device-dax
+		 * mappings which are constant size. The actual size of the
+		 * mapping being torn down is communicated in siginfo, see
+		 * kill_proc()
+		 */
+		loff_t start = ((loff_t)index << PAGE_SHIFT) & ~(size - 1);
+
+		unmap_mapping_range(mapping, start, size, 0);
+	}
+
+	kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
+}
+
+/*
+ * Generic memory-failure handling for a ZONE_DEVICE pfn described by
+ * @pgmap: mark the page hwpoisoned, then unmap it and kill its users.
+ *
+ * Returns 0 on success, -EBUSY when the dax entry cannot be locked,
+ * -EOPNOTSUPP when hwpoison_filter() rejects the page and -ENXIO for
+ * MEMORY_DEVICE_PRIVATE / MEMORY_DEVICE_COHERENT pages.
+ */
+static int mf_generic_kill_procs(unsigned long long pfn, int flags,
+		struct dev_pagemap *pgmap)
+{
+	struct page *page = pfn_to_page(pfn);
+	LIST_HEAD(to_kill);
+	dax_entry_t cookie;
+	int rc = 0;
+
+	/*
+	 * Pages instantiated by device-dax (not filesystem-dax)
+	 * may be compound pages.
+	 */
+	page = compound_head(page);
+
+	/*
+	 * Prevent the inode from being freed while we are interrogating
+	 * the address_space, typically this would be handled by
+	 * lock_page(), but dax pages do not use the page lock. This
+	 * also prevents changes to the mapping of this pfn until
+	 * poison signaling is complete.
+	 */
+	cookie = dax_lock_page(page);
+	if (!cookie)
+		return -EBUSY;
+
+	if (hwpoison_filter(page)) {
+		rc = -EOPNOTSUPP;
+		goto unlock;
+	}
+
+	switch (pgmap->type) {
+	case MEMORY_DEVICE_PRIVATE:
+	case MEMORY_DEVICE_COHERENT:
+		/*
+		 * TODO: Handle device pages which may need coordination
+		 * with device-side memory.
+		 */
+		rc = -ENXIO;
+		goto unlock;
+	default:
+		break;
+	}
+
+	/*
+	 * Use this flag as an indication that the dax page has been
+	 * remapped UC to prevent speculative consumption of poison.
+	 */
+	SetPageHWPoison(page);
+
+	/*
+	 * Unlike System-RAM there is no possibility to swap in a
+	 * different physical page at a given virtual address, so all
+	 * userspace consumption of ZONE_DEVICE memory necessitates
+	 * SIGBUS (i.e. MF_MUST_KILL)
+	 */
+	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+	collect_procs(page, &to_kill, true);
+
+	unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
+unlock:
+	dax_unlock_page(page, cookie);
+	return rc;
+}
+
+#ifdef CONFIG_FS_DAX
+/**
+ * mf_dax_kill_procs - Collect and kill processes who are using this file range
+ * @mapping:	address_space of the file in use
+ * @index:	start pgoff of the range within the file
+ * @count:	length of the range, in unit of PAGE_SIZE
+ * @mf_flags:	memory failure flags
+ *
+ * MF_ACTION_REQUIRED and MF_MUST_KILL are always added to @mf_flags.
+ * Indexes for which no page is returned by dax_lock_mapping_entry() are
+ * skipped.
+ *
+ * Return: 0 on success, -EBUSY as soon as a mapping entry cannot be locked.
+ */
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+		unsigned long count, int mf_flags)
+{
+	LIST_HEAD(to_kill);
+	dax_entry_t cookie;
+	struct page *page;
+	size_t end = index + count;
+
+	mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
+	for (; index < end; index++) {
+		page = NULL;
+		cookie = dax_lock_mapping_entry(mapping, index, &page);
+		if (!cookie)
+			return -EBUSY;
+		if (!page)
+			goto unlock;
+
+		SetPageHWPoison(page);
+
+		collect_procs_fsdax(page, mapping, index, &to_kill);
+		unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
+				index, mf_flags);
+unlock:
+		dax_unlock_mapping_entry(mapping, index, cookie);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
+#endif /* CONFIG_FS_DAX */
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Struct raw_hwp_page represents information about "raw error page",
+ * constructing singly linked list originated from ->private field of
+ * SUBPAGE_INDEX_HWPOISON-th tail page.
+ */
+struct raw_hwp_page {
+	struct llist_node node;	/* link in the per-hugepage raw error list */
+	struct page *page;	/* the raw page recorded for this entry */
+};
+
+/*
+ * Return the head of the raw error page list of @hpage. The list head is
+ * overlaid on the ->private field of the SUBPAGE_INDEX_HWPOISON-th tail page.
+ */
+static inline struct llist_head *raw_hwp_list_head(struct page *hpage)
+{
+	return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON);
+}
+
+/*
+ * Free every raw_hwp_page recorded for @hpage and return how many entries
+ * were freed. When @move_flag is true, PG_hwpoison is set on each recorded
+ * raw page before its entry is freed.
+ *
+ * The entries are detached with llist_del_all() before they are walked:
+ * llist_for_each_safe() is only safe on entries that have already been
+ * removed from the list. Walking head->first while kfree()ing the nodes
+ * left the list head pointing at freed memory until the trailing
+ * llist_del_all().
+ */
+static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag)
+{
+	struct llist_node *t, *tnode, *head;
+	unsigned long count = 0;
+
+	head = llist_del_all(raw_hwp_list_head(hpage));
+	llist_for_each_safe(tnode, t, head) {
+		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
+
+		if (move_flag)
+			SetPageHWPoison(p->page);
+		kfree(p);
+		count++;
+	}
+	return count;
+}
+
+/*
+ * Mark hugepage @hpage as hwpoisoned and record raw error page @page on its
+ * raw error list.
+ *
+ * Returns 0 when @hpage was not hwpoisoned before, -EHWPOISON when it
+ * already was, when its raw error info is unreliable, or when @page is
+ * already on the list.
+ */
+static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page)
+{
+	struct llist_head *head;
+	struct raw_hwp_page *raw_hwp;
+	struct llist_node *t, *tnode;
+	int ret = TestSetPageHWPoison(hpage) ? -EHWPOISON : 0;
+
+	/*
+	 * Once the hwpoison hugepage has lost reliable raw error info,
+	 * there is little meaning to keep additional error info precisely,
+	 * so skip to add additional raw error info.
+	 */
+	if (HPageRawHwpUnreliable(hpage))
+		return -EHWPOISON;
+	head = raw_hwp_list_head(hpage);
+	llist_for_each_safe(tnode, t, head->first) {
+		struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node);
+
+		if (p->page == page)
+			return -EHWPOISON;
+	}
+
+	raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
+	if (raw_hwp) {
+		raw_hwp->page = page;
+		llist_add(&raw_hwp->node, head);
+		/* the first error event will be counted in action_result(). */
+		if (ret)
+			num_poisoned_pages_inc();
+	} else {
+		/*
+		 * Failed to save raw error info.  We no longer trace all
+		 * hwpoisoned subpages, and we need refuse to free/dissolve
+		 * this hwpoisoned hugepage.
+		 */
+		SetHPageRawHwpUnreliable(hpage);
+		/*
+		 * Once HPageRawHwpUnreliable is set, raw_hwp_page is not
+		 * used any more, so free it.
+		 */
+		__free_raw_hwp_pages(hpage, false);
+	}
+	return ret;
+}
+
+/*
+ * Guarded wrapper around __free_raw_hwp_pages(). Returns the number of
+ * entries freed, or 0 when the list is deliberately left untouched.
+ */
+static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag)
+{
+	/*
+	 * HPageVmemmapOptimized hugepages can't be freed because struct
+	 * pages for tail pages are required but they don't exist.
+	 */
+	if (move_flag && HPageVmemmapOptimized(hpage))
+		return 0;
+
+	/*
+	 * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by
+	 * definition.
+	 */
+	if (HPageRawHwpUnreliable(hpage))
+		return 0;
+
+	return __free_raw_hwp_pages(hpage, move_flag);
+}
+
+/*
+ * Clear PG_hwpoison on @hpage and hand the flag down to the recorded raw
+ * pages. Does nothing when the raw error info of @hpage is unreliable.
+ */
+void hugetlb_clear_page_hwpoison(struct page *hpage)
+{
+	if (HPageRawHwpUnreliable(hpage))
+		return;
+	ClearPageHWPoison(hpage);
+	free_raw_hwp_pages(hpage, true);
+}
+
+/*
+ * Called from hugetlb code with hugetlb_lock held.
+ *
+ * Return values:
+ *   0             - free hugepage
+ *   1             - in-use hugepage
+ *   2             - not a hugepage
+ *   -EBUSY        - the hugepage is busy (try to retry)
+ *   -EHWPOISON    - the hugepage is already hwpoisoned
+ */
+int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+	struct page *page = pfn_to_page(pfn);
+	struct page *head = compound_head(page);
+	int ret = 2;	/* fallback to normal page handling */
+	bool count_increased = false;
+
+	if (!PageHeadHuge(head))
+		goto out;
+
+	if (flags & MF_COUNT_INCREASED) {
+		ret = 1;
+		count_increased = true;
+	} else if (HPageFreed(head)) {
+		ret = 0;
+	} else if (HPageMigratable(head)) {
+		ret = get_page_unless_zero(head);
+		if (ret)
+			count_increased = true;
+	} else {
+		ret = -EBUSY;
+		if (!(flags & MF_NO_RETRY))
+			goto out;
+	}
+
+	if (hugetlb_set_page_hwpoison(head, page)) {
+		ret = -EHWPOISON;
+		goto out;
+	}
+
+	return ret;
+out:
+	/* Error/fallback exit: give back the reference held for this call. */
+	if (count_increased)
+		put_page(head);
+	return ret;
+}
+
+/*
+ * Taking refcount of hugetlb pages needs extra care about race conditions
+ * with basic operations like hugepage allocation/free/demotion.
+ * So some of prechecks for hwpoison (pinning, and testing/setting
+ * PageHWPoison) should be done in single hugetlb_lock range.
+ */
+static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
+{
+	int res;
+	struct page *p = pfn_to_page(pfn);
+	struct page *head;
+	unsigned long page_flags;
+
+	/*
+	 * *hugetlb tells the caller whether @pfn was handled here (1) or
+	 * must go through normal page handling (0). The return value is only
+	 * meaningful in the former case.
+	 */
+	*hugetlb = 1;
+retry:
+	res = get_huge_page_for_hwpoison(pfn, flags);
+	if (res == 2) { /* fallback to normal page handling */
+		*hugetlb = 0;
+		return 0;
+	} else if (res == -EHWPOISON) {
+		pr_err("%#lx: already hardware poisoned\n", pfn);
+		if (flags & MF_ACTION_REQUIRED) {
+			head = compound_head(p);
+			res = kill_accessing_process(current, page_to_pfn(head), flags);
+		}
+		return res;
+	} else if (res == -EBUSY) {
+		/* Retry exactly once, with MF_NO_RETRY set for the second pass. */
+		if (!(flags & MF_NO_RETRY)) {
+			flags |= MF_NO_RETRY;
+			goto retry;
+		}
+		action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
+		return res;
+	}
+
+	head = compound_head(p);
+	lock_page(head);
+
+	if (hwpoison_filter(p)) {
+		hugetlb_clear_page_hwpoison(head);
+		unlock_page(head);
+		if (res == 1)
+			put_page(head);
+		return -EOPNOTSUPP;
+	}
+
+	/*
+	 * Handling free hugepage.  The possible race with hugepage allocation
+	 * or demotion can be prevented by PageHWPoison flag.
+	 */
+	if (res == 0) {
+		unlock_page(head);
+		/*
+		 * Only a positive return means the page was dissolved AND
+		 * taken off the buddy list. On 0 the page is still owned by
+		 * the buddy allocator, so its refcount must not be raised and
+		 * the event must not be reported as recovered.
+		 * NOTE(review): relies on the return contract documented at
+		 * __page_handle_poison(), which is defined outside this hunk.
+		 */
+		if (__page_handle_poison(p) > 0) {
+			page_ref_inc(p);
+			res = MF_RECOVERED;
+		} else {
+			res = MF_FAILED;
+		}
+		action_result(pfn, MF_MSG_FREE_HUGE, res);
+		return res == MF_RECOVERED ? 0 : -EBUSY;
+	}
+
+	/* Save the flags before unmapping can change them. */
+	page_flags = head->flags;
+
+	if (!hwpoison_user_mappings(p, pfn, flags, head)) {
+		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
+		res = -EBUSY;
+		goto out;
+	}
+
+	return identify_page_state(pfn, p, page_flags);
+out:
+	unlock_page(head);
+	return res;
+}
+
+#else
+/* !CONFIG_HUGETLB_PAGE: never claims the pfn (*hugetlb is left untouched). */
+static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
+{
+	return 0;
+}
+
+/* !CONFIG_HUGETLB_PAGE: there are no raw error page lists to free. */
+static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag)
+{
+	return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+/*
+ * Handle a memory failure on a ZONE_DEVICE pfn. Tries the pgmap owner's
+ * ->memory_failure() callback first and falls back to
+ * mf_generic_kill_procs() when there is none or it returns -EOPNOTSUPP.
+ * Always drops the pgmap reference taken by the caller and reports the
+ * outcome through action_result(). Returns 0 on success, -errno otherwise.
+ */
+static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
+		struct dev_pagemap *pgmap)
+{
+	struct page *page = pfn_to_page(pfn);
+	int rc = -ENXIO;
+
+	if (flags & MF_COUNT_INCREASED)
+		/*
+		 * Drop the extra refcount in case we come from madvise().
+		 */
+		put_page(page);
+
+	/* device metadata space is not recoverable */
+	if (!pgmap_pfn_valid(pgmap, pfn))
+		goto out;
+
+	/*
+	 * Call driver's implementation to handle the memory failure, otherwise
+	 * fall back to generic handler.
+	 */
+	if (pgmap_has_memory_failure(pgmap)) {
+		rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
+		/*
+		 * Fall back to generic handler too if operation is not
+		 * supported inside the driver/device/filesystem.
+		 */
+		if (rc != -EOPNOTSUPP)
+			goto out;
+	}
+
+	rc = mf_generic_kill_procs(pfn, flags, pgmap);
+out:
+	/* drop pgmap ref acquired in caller */
+	put_dev_pagemap(pgmap);
+	action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
+	return rc;
+}
+
+/* Taken by memory_failure(), unpoison_memory() and soft_offline_page(). */
+static DEFINE_MUTEX(mf_mutex);
+
+/**
+ * memory_failure - Handle memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @flags: fine tune action taken
+ *
+ * This function is called by the low level machine check code
+ * of an architecture when it detects hardware memory corruption
+ * of a page. It tries its best to recover, which includes
+ * dropping pages, killing processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Must run in process context (e.g. a work queue) with interrupts
+ * enabled and no spinlocks held.
+ *
+ * Return: 0 for successfully handled the memory error,
+ *         -EOPNOTSUPP for hwpoison_filter() filtered the error event,
+ *         < 0(except -EOPNOTSUPP) on failure.
+ */
+int memory_failure(unsigned long pfn, int flags)
+{
+	struct page *p;
+	struct page *hpage;
+	struct dev_pagemap *pgmap;
+	int res = 0;
+	unsigned long page_flags;
+	bool retry = true;	/* allows a single pass back to try_again */
+	int hugetlb = 0;
+
+	if (!sysctl_memory_failure_recovery)
+		panic("Memory failure on page %lx", pfn);
+
+	mutex_lock(&mf_mutex);
+
+	/* Remember that a real (not software-simulated) error was seen. */
+	if (!(flags & MF_SW_SIMULATED))
+		hw_memory_failure = true;
+
+	p = pfn_to_online_page(pfn);
+	if (!p) {
+		res = arch_memory_failure(pfn, flags);
+		if (res == 0)
+			goto unlock_mutex;
+
+		if (pfn_valid(pfn)) {
+			pgmap = get_dev_pagemap(pfn, NULL);
+			if (pgmap) {
+				res = memory_failure_dev_pagemap(pfn, flags,
+								 pgmap);
+				goto unlock_mutex;
+			}
+		}
+		pr_err("%#lx: memory outside kernel control\n", pfn);
+		res = -ENXIO;
+		goto unlock_mutex;
+	}
+
+try_again:
+	/* hugetlb pages are handled completely by the hugetlb helper. */
+	res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
+	if (hugetlb)
+		goto unlock_mutex;
+
+	if (TestSetPageHWPoison(p)) {
+		pr_err("%#lx: already hardware poisoned\n", pfn);
+		res = -EHWPOISON;
+		if (flags & MF_ACTION_REQUIRED)
+			res = kill_accessing_process(current, pfn, flags);
+		if (flags & MF_COUNT_INCREASED)
+			put_page(p);
+		goto unlock_mutex;
+	}
+
+	hpage = compound_head(p);
+
+	/*
+	 * We need/can do nothing about count=0 pages.
+	 * 1) it's a free page, and therefore in safe hand:
+	 *    check_new_page() will be the gate keeper.
+	 * 2) it's part of a non-compound high order page.
+	 *    Implies some kernel user: cannot stop them from
+	 *    R/W the page; let's pray that the page has been
+	 *    used and will be freed some time later.
+	 * In fact it's dangerous to directly bump up page count from 0,
+	 * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
+	 */
+	if (!(flags & MF_COUNT_INCREASED)) {
+		res = get_hwpoison_page(p, flags);
+		if (!res) {
+			if (is_free_buddy_page(p)) {
+				if (take_page_off_buddy(p)) {
+					page_ref_inc(p);
+					res = MF_RECOVERED;
+				} else {
+					/* We lost the race, try again */
+					if (retry) {
+						ClearPageHWPoison(p);
+						retry = false;
+						goto try_again;
+					}
+					res = MF_FAILED;
+				}
+				action_result(pfn, MF_MSG_BUDDY, res);
+				res = res == MF_RECOVERED ? 0 : -EBUSY;
+			} else {
+				action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
+				res = -EBUSY;
+			}
+			goto unlock_mutex;
+		} else if (res < 0) {
+			action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
+			res = -EBUSY;
+			goto unlock_mutex;
+		}
+	}
+
+	if (PageTransHuge(hpage)) {
+		/*
+		 * The flag must be set after the refcount is bumped
+		 * otherwise it may race with THP split.
+		 * And the flag can't be set in get_hwpoison_page() since
+		 * it is called by soft offline too and it is just called
+		 * for !MF_COUNT_INCREASED.  So here seems to be the best
+		 * place.
+		 *
+		 * Don't need care about the above error handling paths for
+		 * get_hwpoison_page() since they handle either free page
+		 * or unhandlable page.  The refcount is bumped iff the
+		 * page is a valid handlable page.
+		 */
+		SetPageHasHWPoisoned(hpage);
+		if (try_to_split_thp_page(p) < 0) {
+			action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
+			res = -EBUSY;
+			goto unlock_mutex;
+		}
+		VM_BUG_ON_PAGE(!page_count(p), p);
+	}
+
+	/*
+	 * We ignore non-LRU pages for good reasons.
+	 * - PG_locked is only well defined for LRU pages and a few others
+	 * - to avoid races with __SetPageLocked()
+	 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
+	 * The check (unnecessarily) ignores LRU pages being isolated and
+	 * walked by the page reclaim code, however that's not a big loss.
+	 */
+	shake_page(p);
+
+	lock_page(p);
+
+	/*
+	 * We're only intended to deal with the non-Compound page here.
+	 * However, the page could have changed compound pages due to
+	 * race window. If this happens, we could try again to hopefully
+	 * handle the page next round.
+	 */
+	if (PageCompound(p)) {
+		if (retry) {
+			ClearPageHWPoison(p);
+			unlock_page(p);
+			put_page(p);
+			/* The reference is gone, so the retry must take its own. */
+			flags &= ~MF_COUNT_INCREASED;
+			retry = false;
+			goto try_again;
+		}
+		action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
+		res = -EBUSY;
+		goto unlock_page;
+	}
+
+	/*
+	 * We use page flags to determine what action should be taken, but
+	 * the flags can be modified by the error containment action.  One
+	 * example is an mlocked page, where PG_mlocked is cleared by
+	 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
+	 * correctly, we save a copy of the page flags at this time.
+	 */
+	page_flags = p->flags;
+
+	if (hwpoison_filter(p)) {
+		ClearPageHWPoison(p);
+		unlock_page(p);
+		put_page(p);
+		res = -EOPNOTSUPP;
+		goto unlock_mutex;
+	}
+
+	/*
+	 * __munlock_pagevec may clear a writeback page's LRU flag without
+	 * page_lock. We need wait writeback completion for this page or it
+	 * may trigger vfs BUG while evict inode.
+	 */
+	if (!PageLRU(p) && !PageWriteback(p))
+		goto identify_page_state;
+
+	/*
+	 * It's very difficult to mess with pages currently under IO
+	 * and in many cases impossible, so we just avoid it here.
+	 */
+	wait_on_page_writeback(p);
+
+	/*
+	 * Now take care of user space mappings.
+	 * Abort on fail: __filemap_remove_folio() assumes unmapped page.
+	 */
+	if (!hwpoison_user_mappings(p, pfn, flags, p)) {
+		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
+		res = -EBUSY;
+		goto unlock_page;
+	}
+
+	/*
+	 * Torn down by someone else?
+	 */
+	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
+		action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
+		res = -EBUSY;
+		goto unlock_page;
+	}
+
+identify_page_state:
+	/*
+	 * No unlock_page() on this exit, unlike the unlock_page label below.
+	 * NOTE(review): presumably the page is unlocked inside the
+	 * identify_page_state()/page_action() path; confirm there.
+	 */
+	res = identify_page_state(pfn, p, page_flags);
+	mutex_unlock(&mf_mutex);
+	return res;
+unlock_page:
+	unlock_page(p);
+unlock_mutex:
+	mutex_unlock(&mf_mutex);
+	return res;
+}
+EXPORT_SYMBOL_GPL(memory_failure);
+
+/* Each per-CPU FIFO holds 1 << MEMORY_FAILURE_FIFO_ORDER entries. */
+#define MEMORY_FAILURE_FIFO_ORDER	4
+#define MEMORY_FAILURE_FIFO_SIZE	(1 << MEMORY_FAILURE_FIFO_ORDER)
+
+/* One queued request: the pfn and the MF_* flags to handle it with. */
+struct memory_failure_entry {
+	unsigned long pfn;
+	int flags;
+};
+
+/* Per-CPU queue of pending requests plus the work item that drains it. */
+struct memory_failure_cpu {
+	DECLARE_KFIFO(fifo, struct memory_failure_entry,
+		      MEMORY_FAILURE_FIFO_SIZE);
+	spinlock_t lock;	/* taken around kfifo_put()/kfifo_get() on fifo */
+	struct work_struct work;
+};
+
+static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
+
+/**
+ * memory_failure_queue - Schedule handling memory failure of a page.
+ * @pfn: Page Number of the corrupted page
+ * @flags: Flags for memory failure handling
+ *
+ * This function is called by the low level hardware error handler
+ * when it detects hardware memory corruption of a page. It schedules
+ * the recovering of error page, including dropping pages, killing
+ * processes etc.
+ *
+ * The function is primarily of use for corruptions that
+ * happen outside the current execution context (e.g. when
+ * detected by a background scrubber)
+ *
+ * Can run in IRQ context.
+ */
+void memory_failure_queue(unsigned long pfn, int flags)
+{
+	struct memory_failure_entry request = {
+		.pfn	= pfn,
+		.flags	= flags,
+	};
+	struct memory_failure_cpu *queue;
+	unsigned long irq_state;
+
+	queue = &get_cpu_var(memory_failure_cpu);
+	spin_lock_irqsave(&queue->lock, irq_state);
+	/* A full FIFO drops the request; only the overflow is logged. */
+	if (!kfifo_put(&queue->fifo, request))
+		pr_err("buffer overflow when queuing memory failure at %#lx\n",
+		       pfn);
+	else
+		schedule_work_on(smp_processor_id(), &queue->work);
+	spin_unlock_irqrestore(&queue->lock, irq_state);
+	put_cpu_var(memory_failure_cpu);
+}
+EXPORT_SYMBOL_GPL(memory_failure_queue);
+
+/*
+ * Work handler: drain the FIFO that embeds @work, handing each entry to
+ * soft_offline_page() or memory_failure() depending on MF_SOFT_OFFLINE.
+ * The queue lock is held only while an entry is dequeued, never while it
+ * is being handled.
+ */
+static void memory_failure_work_func(struct work_struct *work)
+{
+	struct memory_failure_cpu *queue =
+		container_of(work, struct memory_failure_cpu, work);
+	struct memory_failure_entry request = { 0, };
+	unsigned long irq_state;
+	int dequeued;
+
+	while (1) {
+		spin_lock_irqsave(&queue->lock, irq_state);
+		dequeued = kfifo_get(&queue->fifo, &request);
+		spin_unlock_irqrestore(&queue->lock, irq_state);
+		if (!dequeued)
+			return;
+
+		if (request.flags & MF_SOFT_OFFLINE)
+			soft_offline_page(request.pfn, request.flags);
+		else
+			memory_failure(request.pfn, request.flags);
+	}
+}
+
+/*
+ * Process memory_failure work queued on the specified CPU.
+ * Used to avoid return-to-userspace racing with the memory_failure workqueue.
+ */
+void memory_failure_queue_kick(int cpu)
+{
+	struct memory_failure_cpu *queue = per_cpu_ptr(&memory_failure_cpu, cpu);
+
+	/* Cancel the pending work item first, then drain the FIFO from here. */
+	cancel_work_sync(&queue->work);
+	memory_failure_work_func(&queue->work);
+}
+
+/* Set up the lock, FIFO and work item of every possible CPU's queue. */
+static int __init memory_failure_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct memory_failure_cpu *queue =
+			per_cpu_ptr(&memory_failure_cpu, cpu);
+
+		spin_lock_init(&queue->lock);
+		INIT_KFIFO(queue->fifo);
+		INIT_WORK(&queue->work, memory_failure_work_func);
+	}
+
+	return 0;
+}
+core_initcall(memory_failure_init);
+
+/* From here on messages are printed without a pr_fmt prefix. */
+#undef pr_fmt
+#define pr_fmt(fmt)	"" fmt
+/* pr_info() gated by ratelimit state @rs; @fmt takes @pfn as its argument. */
+#define unpoison_pr_info(fmt, pfn, rs)			\
+({							\
+	if (__ratelimit(rs))				\
+		pr_info(fmt, pfn);			\
+})
+
+/**
+ * unpoison_memory - Unpoison a previously poisoned page
+ * @pfn: Page number of the to be unpoisoned page
+ *
+ * Software-unpoison a page that has been poisoned by
+ * memory_failure() earlier.
+ *
+ * This is only done on the software-level, so it only works
+ * for linux injected failures, not real hardware failures
+ *
+ * Returns 0 for success, otherwise -errno.
+ */
+int unpoison_memory(unsigned long pfn)
+{
+	struct page *page;
+	struct page *p;
+	int ret = -EBUSY;
+	int ghp;	/* result of get_hwpoison_page(), kept apart from ret */
+	int freeit = 0;
+	unsigned long count = 1;
+	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
+					DEFAULT_RATELIMIT_BURST);
+
+	if (!pfn_valid(pfn))
+		return -ENXIO;
+
+	p = pfn_to_page(pfn);
+	page = compound_head(p);
+
+	mutex_lock(&mf_mutex);
+
+	if (hw_memory_failure) {
+		unpoison_pr_info("Unpoison: Disabled after HW memory failure %#lx\n",
+				 pfn, &unpoison_rs);
+		ret = -EOPNOTSUPP;
+		goto unlock_mutex;
+	}
+
+	if (!PageHWPoison(p)) {
+		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
+				 pfn, &unpoison_rs);
+		goto unlock_mutex;
+	}
+
+	if (page_count(page) > 1) {
+		unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
+				 pfn, &unpoison_rs);
+		goto unlock_mutex;
+	}
+
+	if (page_mapped(page)) {
+		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
+				 pfn, &unpoison_rs);
+		goto unlock_mutex;
+	}
+
+	if (page_mapping(page)) {
+		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
+				 pfn, &unpoison_rs);
+		goto unlock_mutex;
+	}
+
+	if (PageSlab(page) || PageTable(page) || PageReserved(page))
+		goto unlock_mutex;
+
+	/*
+	 * Keep the refcount result in its own variable. get_hwpoison_page()
+	 * returns 1 on success, and reusing ret for it made this function
+	 * return 1 instead of -EBUSY whenever TestClearPageHWPoison() lost a
+	 * race in the last branch below.
+	 */
+	ghp = get_hwpoison_page(p, MF_UNPOISON);
+	if (!ghp) {
+		if (PageHuge(p)) {
+			count = free_raw_hwp_pages(page, false);
+			if (count == 0)
+				goto unlock_mutex;
+		}
+		ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
+	} else if (ghp < 0) {
+		if (ghp == -EHWPOISON) {
+			ret = put_page_back_buddy(p) ? 0 : -EBUSY;
+		} else {
+			/* Pass the refcount error on to the caller unchanged. */
+			ret = ghp;
+			unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
+					 pfn, &unpoison_rs);
+		}
+	} else {
+		if (PageHuge(p)) {
+			count = free_raw_hwp_pages(page, false);
+			if (count == 0) {
+				put_page(page);
+				goto unlock_mutex;
+			}
+		}
+		freeit = !!TestClearPageHWPoison(p);
+
+		/* Drop the reference taken by get_hwpoison_page() above. */
+		put_page(page);
+		if (freeit) {
+			/* Second put_page() happens only when the flag was cleared. */
+			put_page(page);
+			ret = 0;
+		}
+	}
+
+unlock_mutex:
+	mutex_unlock(&mf_mutex);
+	/* freeit now always implies ret == 0, so testing ret is sufficient. */
+	if (!ret) {
+		num_poisoned_pages_sub(count);
+		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
+				 page_to_pfn(p), &unpoison_rs);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(unpoison_memory);
+
+/*
+ * Isolate @page onto @pagelist, using the hugetlb, LRU or movable-page
+ * isolation helper as appropriate. One reference on @page is dropped in
+ * every case (see the comment before put_page()). Returns true when the
+ * page was isolated.
+ */
+static bool isolate_page(struct page *page, struct list_head *pagelist)
+{
+	bool isolated = false;
+
+	if (PageHuge(page)) {
+		isolated = !isolate_hugetlb(page, pagelist);
+	} else {
+		bool lru = !__PageMovable(page);
+
+		if (lru)
+			isolated = !isolate_lru_page(page);
+		else
+			isolated = !isolate_movable_page(page,
+							 ISOLATE_UNEVICTABLE);
+
+		if (isolated) {
+			list_add(&page->lru, pagelist);
+			if (lru)
+				inc_node_page_state(page, NR_ISOLATED_ANON +
+						    page_is_file_lru(page));
+		}
+	}
+
+	/*
+	 * If we succeed to isolate the page, we grabbed another refcount on
+	 * the page, so we can safely drop the one we got from get_any_page().
+	 * If we failed to isolate the page, it means that we cannot go further
+	 * and we will return an error, so drop the reference we got from
+	 * get_any_page() as well.
+	 */
+	put_page(page);
+	return isolated;
+}
+
+/*
+ * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages.
+ * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
+ * If the page is mapped, it migrates the contents over.
+ */
+static int soft_offline_in_use_page(struct page *page)
+{
+	long ret = 0;
+	unsigned long pfn = page_to_pfn(page);
+	struct page *hpage = compound_head(page);
+	char const *msg_page[] = {"page", "hugepage"};	/* indexed by 'huge' */
+	bool huge = PageHuge(page);
+	LIST_HEAD(pagelist);
+	struct migration_target_control mtc = {
+		.nid = NUMA_NO_NODE,
+		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+	};
+
+	/* A THP is split first; afterwards only @page itself is handled. */
+	if (!huge && PageTransHuge(hpage)) {
+		if (try_to_split_thp_page(page)) {
+			pr_info("soft offline: %#lx: thp split failed\n", pfn);
+			return -EBUSY;
+		}
+		hpage = page;
+	}
+
+	lock_page(page);
+	if (!PageHuge(page))
+		wait_on_page_writeback(page);
+	if (PageHWPoison(page)) {
+		unlock_page(page);
+		put_page(page);
+		pr_info("soft offline: %#lx page already poisoned\n", pfn);
+		return 0;
+	}
+
+	if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page))
+		/*
+		 * Try to invalidate first. This should work for
+		 * non dirty unmapped page cache pages.
+		 */
+		ret = invalidate_inode_page(page);
+	unlock_page(page);
+
+	/* Non-zero here means the invalidation above succeeded. */
+	if (ret) {
+		pr_info("soft_offline: %#lx: invalidated\n", pfn);
+		page_handle_poison(page, false, true);
+		return 0;
+	}
+
+	/* Otherwise isolate the page and migrate its contents away. */
+	if (isolate_page(hpage, &pagelist)) {
+		ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
+			(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
+		if (!ret) {
+			bool release = !huge;
+
+			if (!page_handle_poison(page, huge, release))
+				ret = -EBUSY;
+		} else {
+			if (!list_empty(&pagelist))
+				putback_movable_pages(&pagelist);
+
+			pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n",
+				 pfn, msg_page[huge], ret, &page->flags);
+			/* A positive result is reported to the caller as -EBUSY. */
+			if (ret > 0)
+				ret = -EBUSY;
+		}
+	} else {
+		pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n",
+			 pfn, msg_page[huge], page_count(page), &page->flags);
+		ret = -EBUSY;
+	}
+	return ret;
+}
+
+/* put_page() that tolerates a NULL @page. */
+static void put_ref_page(struct page *page)
+{
+	if (page)
+		put_page(page);
+}
+
+/**
+ * soft_offline_page - Soft offline a page.
+ * @pfn: pfn to soft-offline
+ * @flags: flags. Same as memory_failure().
+ *
+ * Returns 0 on success
+ *         -EOPNOTSUPP for hwpoison_filter() filtered the error event
+ *         < 0 otherwise negated errno.
+ *
+ * Soft offline a page, by migration or invalidation,
+ * without killing anything. This is for the case when
+ * a page is not corrupted yet (so it's still valid to access),
+ * but has had a number of corrected errors and is better taken
+ * out.
+ *
+ * The actual policy on when to do that is maintained by
+ * user space.
+ *
+ * This should never impact any application or cause data loss,
+ * however it might take some time.
+ *
+ * This is not a 100% solution for all memory, but tries to be
+ * ``good enough'' for the majority of memory.
+ */
+int soft_offline_page(unsigned long pfn, int flags)
+{
+	int ret;
+	bool try_again = true;	/* permits a single retry of the free-page path */
+	struct page *page, *ref_page = NULL;
+
+	WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED));
+
+	if (!pfn_valid(pfn))
+		return -ENXIO;
+	/* Remember the page so the early exits can drop the caller's ref. */
+	if (flags & MF_COUNT_INCREASED)
+		ref_page = pfn_to_page(pfn);
+
+	/* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */
+	page = pfn_to_online_page(pfn);
+	if (!page) {
+		put_ref_page(ref_page);
+		return -EIO;
+	}
+
+	mutex_lock(&mf_mutex);
+
+	if (PageHWPoison(page)) {
+		pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
+		put_ref_page(ref_page);
+		mutex_unlock(&mf_mutex);
+		return 0;
+	}
+
+retry:
+	get_online_mems();
+	ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
+	put_online_mems();
+
+	if (hwpoison_filter(page)) {
+		if (ret > 0)
+			put_page(page);
+
+		mutex_unlock(&mf_mutex);
+		return -EOPNOTSUPP;
+	}
+
+	if (ret > 0) {
+		ret = soft_offline_in_use_page(page);
+	} else if (ret == 0) {
+		if (!page_handle_poison(page, true, false)) {
+			if (try_again) {
+				try_again = false;
+				flags &= ~MF_COUNT_INCREASED;
+				goto retry;
+			}
+			ret = -EBUSY;
+		}
+	}
+
+	mutex_unlock(&mf_mutex);
+
+	return ret;
+}
+
+void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+	int i, total = 0;
+
+	/*
+	 * A further optimization is to have per section refcounted
+	 * num_poisoned_pages.  But that would need more space per memmap, so
+	 * for now just do a quick global check to speed up this routine in the
+	 * absence of bad pages.
+	 */
+	if (atomic_long_read(&num_poisoned_pages) == 0)
+		return;
+
+	for (i = 0; i < nr_pages; i++) {
+		if (PageHWPoison(&memmap[i])) {
+			total++;
+			ClearPageHWPoison(&memmap[i]);
+		}
+	}
+	if (total)
+		num_poisoned_pages_sub(total);
+}
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
new file mode 100644
index 000000000..ba863f467
--- /dev/null
+++ b/mm/memory-tiers.c
@@ -0,0 +1,732 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "internal.h"
+
+struct memory_tier {
+	/* hierarchy of memory tiers */
+	struct list_head list;
+	/* list of all memory types part of this tier */
+	struct list_head memory_types;
+	/*
+	 * start value of abstract distance. memory tier maps
+	 * an abstract distance range,
+	 * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
+	 */
+	int adistance_start;
+	struct device dev;
+	/* All the nodes that are part of all the lower memory tiers. */
+	nodemask_t lower_tier_mask;
+};
+
+struct demotion_nodes {
+	nodemask_t preferred;
+};
+
+struct node_memory_type_map {
+	struct memory_dev_type *memtype;
+	int map_count;
+};
+
+static DEFINE_MUTEX(memory_tier_lock);
+static LIST_HEAD(memory_tiers);
+static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
+static struct memory_dev_type *default_dram_type;
+
+static struct bus_type memory_tier_subsys = {
+	.name = "memory_tiering",
+	.dev_name = "memory_tier",
+};
+
+#ifdef CONFIG_MIGRATION
+static int top_tier_adistance;
+/*
+ * node_demotion[] examples:
+ *
+ * Example 1:
+ *
+ * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
+ *
+ * node distances:
+ * node   0    1    2    3
+ *    0  10   20   30   40
+ *    1  20   10   40   30
+ *    2  30   40   10   40
+ *    3  40   30   40   10
+ *
+ * memory_tiers0 = 0-1
+ * memory_tiers1 = 2-3
+ *
+ * node_demotion[0].preferred = 2
+ * node_demotion[1].preferred = 3
+ * node_demotion[2].preferred = <empty>
+ * node_demotion[3].preferred = <empty>
+ *
+ * Example 2:
+ *
+ * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
+ *
+ * node distances:
+ * node   0    1    2
+ *    0  10   20   30
+ *    1  20   10   30
+ *    2  30   30   10
+ *
+ * memory_tiers0 = 0-2
+ *
+ * node_demotion[0].preferred = <empty>
+ * node_demotion[1].preferred = <empty>
+ * node_demotion[2].preferred = <empty>
+ *
+ * Example 3:
+ *
+ * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node.
+ * + * node distances: + * node 0 1 2 + * 0 10 20 30 + * 1 20 10 40 + * 2 30 40 10 + * + * memory_tiers0 = 1 + * memory_tiers1 = 0 + * memory_tiers2 = 2 + * + * node_demotion[0].preferred = 2 + * node_demotion[1].preferred = 0 + * node_demotion[2].preferred = + * + */ +static struct demotion_nodes *node_demotion __read_mostly; +#endif /* CONFIG_MIGRATION */ + +static inline struct memory_tier *to_memory_tier(struct device *device) +{ + return container_of(device, struct memory_tier, dev); +} + +static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier) +{ + nodemask_t nodes = NODE_MASK_NONE; + struct memory_dev_type *memtype; + + list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling) + nodes_or(nodes, nodes, memtype->nodes); + + return nodes; +} + +static void memory_tier_device_release(struct device *dev) +{ + struct memory_tier *tier = to_memory_tier(dev); + /* + * synchronize_rcu in clear_node_memory_tier makes sure + * we don't have rcu access to this memory tier. 
+ */ + kfree(tier); +} + +static ssize_t nodelist_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int ret; + nodemask_t nmask; + + mutex_lock(&memory_tier_lock); + nmask = get_memtier_nodemask(to_memory_tier(dev)); + ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask)); + mutex_unlock(&memory_tier_lock); + return ret; +} +static DEVICE_ATTR_RO(nodelist); + +static struct attribute *memtier_dev_attrs[] = { + &dev_attr_nodelist.attr, + NULL +}; + +static const struct attribute_group memtier_dev_group = { + .attrs = memtier_dev_attrs, +}; + +static const struct attribute_group *memtier_dev_groups[] = { + &memtier_dev_group, + NULL +}; + +static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype) +{ + int ret; + bool found_slot = false; + struct memory_tier *memtier, *new_memtier; + int adistance = memtype->adistance; + unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE; + + lockdep_assert_held_once(&memory_tier_lock); + + adistance = round_down(adistance, memtier_adistance_chunk_size); + /* + * If the memtype is already part of a memory tier, + * just return that. 
+ */ + if (!list_empty(&memtype->tier_sibiling)) { + list_for_each_entry(memtier, &memory_tiers, list) { + if (adistance == memtier->adistance_start) + return memtier; + } + WARN_ON(1); + return ERR_PTR(-EINVAL); + } + + list_for_each_entry(memtier, &memory_tiers, list) { + if (adistance == memtier->adistance_start) { + goto link_memtype; + } else if (adistance < memtier->adistance_start) { + found_slot = true; + break; + } + } + + new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL); + if (!new_memtier) + return ERR_PTR(-ENOMEM); + + new_memtier->adistance_start = adistance; + INIT_LIST_HEAD(&new_memtier->list); + INIT_LIST_HEAD(&new_memtier->memory_types); + if (found_slot) + list_add_tail(&new_memtier->list, &memtier->list); + else + list_add_tail(&new_memtier->list, &memory_tiers); + + new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS; + new_memtier->dev.bus = &memory_tier_subsys; + new_memtier->dev.release = memory_tier_device_release; + new_memtier->dev.groups = memtier_dev_groups; + + ret = device_register(&new_memtier->dev); + if (ret) { + list_del(&new_memtier->list); + put_device(&new_memtier->dev); + return ERR_PTR(ret); + } + memtier = new_memtier; + +link_memtype: + list_add(&memtype->tier_sibiling, &memtier->memory_types); + return memtier; +} + +static struct memory_tier *__node_get_memory_tier(int node) +{ + pg_data_t *pgdat; + + pgdat = NODE_DATA(node); + if (!pgdat) + return NULL; + /* + * Since we hold memory_tier_lock, we can avoid + * RCU read locks when accessing the details. No + * parallel updates are possible here. 
+ */ + return rcu_dereference_check(pgdat->memtier, + lockdep_is_held(&memory_tier_lock)); +} + +#ifdef CONFIG_MIGRATION +bool node_is_toptier(int node) +{ + bool toptier; + pg_data_t *pgdat; + struct memory_tier *memtier; + + pgdat = NODE_DATA(node); + if (!pgdat) + return false; + + rcu_read_lock(); + memtier = rcu_dereference(pgdat->memtier); + if (!memtier) { + toptier = true; + goto out; + } + if (memtier->adistance_start <= top_tier_adistance) + toptier = true; + else + toptier = false; +out: + rcu_read_unlock(); + return toptier; +} + +void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets) +{ + struct memory_tier *memtier; + + /* + * pg_data_t.memtier updates includes a synchronize_rcu() + * which ensures that we either find NULL or a valid memtier + * in NODE_DATA. protect the access via rcu_read_lock(); + */ + rcu_read_lock(); + memtier = rcu_dereference(pgdat->memtier); + if (memtier) + *targets = memtier->lower_tier_mask; + else + *targets = NODE_MASK_NONE; + rcu_read_unlock(); +} + +/** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * Return: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ +int next_demotion_node(int node) +{ + struct demotion_nodes *nd; + int target; + + if (!node_demotion) + return NUMA_NO_NODE; + + nd = &node_demotion[node]; + + /* + * node_demotion[] is updated without excluding this + * function from running. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. + */ + rcu_read_lock(); + /* + * If there are multiple target nodes, just select one + * target node randomly. 
+ * + * In addition, we can also use round-robin to select + * target node, but we should introduce another variable + * for node_demotion[] to record last selected target node, + * that may cause cache ping-pong due to the changing of + * last target node. Or introducing per-cpu data to avoid + * caching issue, which seems more complicated. So selecting + * target node randomly seems better until now. + */ + target = node_random(&nd->preferred); + rcu_read_unlock(); + + return target; +} + +static void disable_all_demotion_targets(void) +{ + struct memory_tier *memtier; + int node; + + for_each_node_state(node, N_MEMORY) { + node_demotion[node].preferred = NODE_MASK_NONE; + /* + * We are holding memory_tier_lock, so it is safe + * to access pgdat->memtier. + */ + memtier = __node_get_memory_tier(node); + if (memtier) + memtier->lower_tier_mask = NODE_MASK_NONE; + } + /* + * Ensure that the "disable" is visible across the system. + * Readers will see either a combination of before+disable + * state or disable+after. They will never see before and + * after state together. + */ + synchronize_rcu(); +} + +/* + * Find an automatic demotion target for all memory + * nodes. Failing here is OK. It might just indicate + * being at the end of a chain. + */ +static void establish_demotion_targets(void) +{ + struct memory_tier *memtier; + struct demotion_nodes *nd; + int target = NUMA_NO_NODE, node; + int distance, best_distance; + nodemask_t tier_nodes, lower_tier; + + lockdep_assert_held_once(&memory_tier_lock); + + if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION)) + return; + + disable_all_demotion_targets(); + + for_each_node_state(node, N_MEMORY) { + best_distance = -1; + nd = &node_demotion[node]; + + memtier = __node_get_memory_tier(node); + if (!memtier || list_is_last(&memtier->list, &memory_tiers)) + continue; + /* + * Get the lower memtier to find the demotion node list.
+ */ + memtier = list_next_entry(memtier, list); + tier_nodes = get_memtier_nodemask(memtier); + /* + * find_next_best_node, use 'used' nodemask as a skip list. + * Add all memory nodes except the selected memory tier + * nodelist to skip list so that we find the best node from the + * memtier nodelist. + */ + nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes); + + /* + * Find all the nodes in the memory tier node list of same best distance. + * Add them to the preferred mask. We randomly select between nodes + * in the preferred mask when allocating pages during demotion. + */ + do { + target = find_next_best_node(node, &tier_nodes); + if (target == NUMA_NO_NODE) + break; + + distance = node_distance(node, target); + if (distance == best_distance || best_distance == -1) { + best_distance = distance; + node_set(target, nd->preferred); + } else { + break; + } + } while (1); + } + /* + * Promotion is allowed from a memory tier to higher + * memory tier only if the memory tier doesn't include + * compute. We want to skip promotion from a memory tier, + * if any node that is part of the memory tier has CPUs. + * Once we detect such a memory tier, we consider that tier + * as top tier from which promotion is not allowed. + */ + list_for_each_entry_reverse(memtier, &memory_tiers, list) { + tier_nodes = get_memtier_nodemask(memtier); + nodes_and(tier_nodes, node_states[N_CPU], tier_nodes); + if (!nodes_empty(tier_nodes)) { + /* + * abstract distance below the max value of this memtier + * is considered toptier. + */ + top_tier_adistance = memtier->adistance_start + + MEMTIER_CHUNK_SIZE - 1; + break; + } + } + /* + * Now build the lower_tier mask for each node collecting node mask from + * all memory tier below it. This allows us to fallback demotion page + * allocation to a set of nodes that is closer the above selected + * perferred node.
+ */ + lower_tier = node_states[N_MEMORY]; + list_for_each_entry(memtier, &memory_tiers, list) { + /* + * Keep removing current tier from lower_tier nodes. + * This will remove all nodes in current and above + * memory tier from the lower_tier mask. + */ + tier_nodes = get_memtier_nodemask(memtier); + nodes_andnot(lower_tier, lower_tier, tier_nodes); + memtier->lower_tier_mask = lower_tier; + } +} + +#else +static inline void disable_all_demotion_targets(void) {} +static inline void establish_demotion_targets(void) {} +#endif /* CONFIG_MIGRATION */ + +static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype) +{ + if (!node_memory_types[node].memtype) + node_memory_types[node].memtype = memtype; + /* + * For each device getting added in the same NUMA node + * with this specific memtype, bump the map count. We + * only take memtype device reference once, so that + * changing a node memtype can be done by dropping the + * only reference count taken here. + */ + + if (node_memory_types[node].memtype == memtype) { + if (!node_memory_types[node].map_count++) + kref_get(&memtype->kref); + } +} + +static struct memory_tier *set_node_memory_tier(int node) +{ + struct memory_tier *memtier; + struct memory_dev_type *memtype; + pg_data_t *pgdat = NODE_DATA(node); + + + lockdep_assert_held_once(&memory_tier_lock); + + if (!node_state(node, N_MEMORY)) + return ERR_PTR(-EINVAL); + + __init_node_memory_type(node, default_dram_type); + + memtype = node_memory_types[node].memtype; + node_set(node, memtype->nodes); + memtier = find_create_memory_tier(memtype); + if (!IS_ERR(memtier)) + rcu_assign_pointer(pgdat->memtier, memtier); + return memtier; +} + +static void destroy_memory_tier(struct memory_tier *memtier) +{ + list_del(&memtier->list); + device_unregister(&memtier->dev); +} + +static bool clear_node_memory_tier(int node) +{ + bool cleared = false; + pg_data_t *pgdat; + struct memory_tier *memtier; + + pgdat = NODE_DATA(node); + if (!pgdat) + return
false; + + /* + * Make sure that anybody looking at NODE_DATA who finds + * a valid memtier finds memory_dev_types with nodes still + * linked to the memtier. We achieve this by waiting for + * rcu read section to finish using synchronize_rcu. + * This also enables us to free the destroyed memory tier + * with kfree instead of kfree_rcu + */ + memtier = __node_get_memory_tier(node); + if (memtier) { + struct memory_dev_type *memtype; + + rcu_assign_pointer(pgdat->memtier, NULL); + synchronize_rcu(); + memtype = node_memory_types[node].memtype; + node_clear(node, memtype->nodes); + if (nodes_empty(memtype->nodes)) { + list_del_init(&memtype->tier_sibiling); + if (list_empty(&memtier->memory_types)) + destroy_memory_tier(memtier); + } + cleared = true; + } + return cleared; +} + +static void release_memtype(struct kref *kref) +{ + struct memory_dev_type *memtype; + + memtype = container_of(kref, struct memory_dev_type, kref); + kfree(memtype); +} + +struct memory_dev_type *alloc_memory_type(int adistance) +{ + struct memory_dev_type *memtype; + + memtype = kmalloc(sizeof(*memtype), GFP_KERNEL); + if (!memtype) + return ERR_PTR(-ENOMEM); + + memtype->adistance = adistance; + INIT_LIST_HEAD(&memtype->tier_sibiling); + memtype->nodes = NODE_MASK_NONE; + kref_init(&memtype->kref); + return memtype; +} +EXPORT_SYMBOL_GPL(alloc_memory_type); + +void destroy_memory_type(struct memory_dev_type *memtype) +{ + kref_put(&memtype->kref, release_memtype); +} +EXPORT_SYMBOL_GPL(destroy_memory_type); + +void init_node_memory_type(int node, struct memory_dev_type *memtype) +{ + + mutex_lock(&memory_tier_lock); + __init_node_memory_type(node, memtype); + mutex_unlock(&memory_tier_lock); +} +EXPORT_SYMBOL_GPL(init_node_memory_type); + +void clear_node_memory_type(int node, struct memory_dev_type *memtype) +{ + mutex_lock(&memory_tier_lock); + if (node_memory_types[node].memtype == memtype) + node_memory_types[node].map_count--; + /* + * If we umapped all the attached devices to this 
node, + * clear the node memory type. + */ + if (!node_memory_types[node].map_count) { + node_memory_types[node].memtype = NULL; + kref_put(&memtype->kref, release_memtype); + } + mutex_unlock(&memory_tier_lock); +} +EXPORT_SYMBOL_GPL(clear_node_memory_type); + +static int __meminit memtier_hotplug_callback(struct notifier_block *self, + unsigned long action, void *_arg) +{ + struct memory_tier *memtier; + struct memory_notify *arg = _arg; + + /* + * Only update the node migration order when a node is + * changing status, like online->offline. + */ + if (arg->status_change_nid < 0) + return notifier_from_errno(0); + + switch (action) { + case MEM_OFFLINE: + mutex_lock(&memory_tier_lock); + if (clear_node_memory_tier(arg->status_change_nid)) + establish_demotion_targets(); + mutex_unlock(&memory_tier_lock); + break; + case MEM_ONLINE: + mutex_lock(&memory_tier_lock); + memtier = set_node_memory_tier(arg->status_change_nid); + if (!IS_ERR(memtier)) + establish_demotion_targets(); + mutex_unlock(&memory_tier_lock); + break; + } + + return notifier_from_errno(0); +} + +static int __init memory_tier_init(void) +{ + int ret, node; + struct memory_tier *memtier; + + ret = subsys_virtual_register(&memory_tier_subsys, NULL); + if (ret) + panic("%s() failed to register memory tier subsystem\n", __func__); + +#ifdef CONFIG_MIGRATION + node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes), + GFP_KERNEL); + WARN_ON(!node_demotion); +#endif + mutex_lock(&memory_tier_lock); + /* + * For now we can have 4 faster memory tiers with smaller adistance + * than default DRAM tier. + */ + default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM); + /* alloc_memory_type() reports failure via ERR_PTR(), never NULL. */ + if (IS_ERR(default_dram_type)) + panic("%s() failed to allocate default DRAM tier\n", __func__); + + /* + * Look at all the existing N_MEMORY nodes and add them to + * default memory tier or to a tier if we already have memory + * types assigned.
+ */ + for_each_node_state(node, N_MEMORY) { + memtier = set_node_memory_tier(node); + if (IS_ERR(memtier)) + /* + * Continue with memtiers we are able to setup + */ + break; + } + establish_demotion_targets(); + mutex_unlock(&memory_tier_lock); + + hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO); + return 0; +} +subsys_initcall(memory_tier_init); + +bool numa_demotion_enabled = false; + +#ifdef CONFIG_MIGRATION +#ifdef CONFIG_SYSFS +static ssize_t numa_demotion_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + numa_demotion_enabled ? "true" : "false"); +} + +static ssize_t numa_demotion_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + + ret = kstrtobool(buf, &numa_demotion_enabled); + if (ret) + return ret; + + return count; +} + +static struct kobj_attribute numa_demotion_enabled_attr = + __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show, + numa_demotion_enabled_store); + +static struct attribute *numa_attrs[] = { + &numa_demotion_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group numa_attr_group = { + .attrs = numa_attrs, +}; + +static int __init numa_init_sysfs(void) +{ + int err; + struct kobject *numa_kobj; + + numa_kobj = kobject_create_and_add("numa", mm_kobj); + if (!numa_kobj) { + pr_err("failed to create numa kobject\n"); + return -ENOMEM; + } + err = sysfs_create_group(numa_kobj, &numa_attr_group); + if (err) { + pr_err("failed to register numa group\n"); + goto delete_obj; + } + return 0; + +delete_obj: + kobject_put(numa_kobj); + return err; +} +subsys_initcall(numa_init_sysfs); +#endif /* CONFIG_SYSFS */ +#endif diff --git a/mm/memory.c b/mm/memory.c new file mode 100644 index 000000000..fc8b264ec --- /dev/null +++ b/mm/memory.c @@ -0,0 +1,6018 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus 
Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. + * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) + * + * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "pgalloc-track.h" +#include "internal.h" +#include "swap.h" + +#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) +#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. 
+#endif + +#ifndef CONFIG_NUMA +unsigned long max_mapnr; +EXPORT_SYMBOL(max_mapnr); + +struct page *mem_map; +EXPORT_SYMBOL(mem_map); +#endif + +static vm_fault_t do_fault(struct vm_fault *vmf); + +/* + * A number of key systems in x86 including ioremap() rely on the assumption + * that high_memory defines the upper bound on direct map memory, then end + * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and + * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL + * and ZONE_HIGHMEM. + */ +void *high_memory; +EXPORT_SYMBOL(high_memory); + +/* + * Randomize the address space (stacks, mmaps, brk, etc.). + * + * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, + * as ancient (libc5 based) binaries can segfault. ) + */ +int randomize_va_space __read_mostly = +#ifdef CONFIG_COMPAT_BRK + 1; +#else + 2; +#endif + +#ifndef arch_wants_old_prefaulted_pte +static inline bool arch_wants_old_prefaulted_pte(void) +{ + /* + * Transitioning a PTE from 'old' to 'young' can be expensive on + * some architectures, even if it's performed in hardware. By + * default, "false" means prefaulted entries will be 'young'. 
+ */ + return false; +} +#endif + +static int __init disable_randmaps(char *s) +{ + randomize_va_space = 0; + return 1; +} +__setup("norandmaps", disable_randmaps); + +unsigned long zero_pfn __read_mostly; +EXPORT_SYMBOL(zero_pfn); + +unsigned long highest_memmap_pfn __read_mostly; + +/* + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() + */ +static int __init init_zero_pfn(void) +{ + zero_pfn = page_to_pfn(ZERO_PAGE(0)); + return 0; +} +early_initcall(init_zero_pfn); + +void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) +{ + trace_rss_stat(mm, member, count); +} + +#if defined(SPLIT_RSS_COUNTING) + +void sync_mm_rss(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + if (current->rss_stat.count[i]) { + add_mm_counter(mm, i, current->rss_stat.count[i]); + current->rss_stat.count[i] = 0; + } + } + current->rss_stat.events = 0; +} + +static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) +{ + struct task_struct *task = current; + + if (likely(task->mm == mm)) + task->rss_stat.count[member] += val; + else + add_mm_counter(mm, member, val); +} +#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) +#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) + +/* sync counter once per 64 page faults */ +#define TASK_RSS_EVENTS_THRESH (64) +static void check_sync_rss_stat(struct task_struct *task) +{ + if (unlikely(task != current)) + return; + if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) + sync_mm_rss(task->mm); +} +#else /* SPLIT_RSS_COUNTING */ + +#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) +#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) + +static void check_sync_rss_stat(struct task_struct *task) +{ +} + +#endif /* SPLIT_RSS_COUNTING */ + +/* + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. 
+ */ +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) +{ + pgtable_t token = pmd_pgtable(*pmd); + pmd_clear(pmd); + pte_free_tlb(tlb, token, addr); + mm_dec_nr_ptes(tlb->mm); +} + +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pmd_t *pmd; + unsigned long next; + unsigned long start; + + start = addr; + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + continue; + free_pte_range(tlb, pmd, addr); + } while (pmd++, addr = next, addr != end); + + start &= PUD_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PUD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pmd = pmd_offset(pud, start); + pud_clear(pud); + pmd_free_tlb(tlb, pmd, start); + mm_dec_nr_pmds(tlb->mm); +} + +static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pud_t *pud; + unsigned long next; + unsigned long start; + + start = addr; + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + free_pmd_range(tlb, pud, addr, next, floor, ceiling); + } while (pud++, addr = next, addr != end); + + start &= P4D_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= P4D_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pud = pud_offset(p4d, start); + p4d_clear(p4d); + pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); +} + +static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + p4d_t *p4d; + unsigned long next; + unsigned long start; + + start = addr; + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if 
(p4d_none_or_clear_bad(p4d)) + continue; + free_pud_range(tlb, p4d, addr, next, floor, ceiling); + } while (p4d++, addr = next, addr != end); + + start &= PGDIR_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PGDIR_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + p4d = p4d_offset(pgd, start); + pgd_clear(pgd); + p4d_free_tlb(tlb, p4d, start); +} + +/* + * This function frees user-level page tables of a process. + */ +void free_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pgd_t *pgd; + unsigned long next; + + /* + * The next few lines have given us lots of grief... + * + * Why are we testing PMD* at this top level? Because often + * there will be no work to do at all, and we'd prefer not to + * go all the way down to the bottom just to discover that. + * + * Why all these "- 1"s? Because 0 represents both the bottom + * of the address space and the top of it (using -1 for the + * top wouldn't help much: the masks would do the wrong thing). + * The rule is that addr 0 and floor 0 refer to the bottom of + * the address space, but end 0 and ceiling 0 refer to the top + * Comparisons need to use "end - 1" and "ceiling - 1" (though + * that end 0 case should be mythical). + * + * Wherever addr is brought up or ceiling brought down, we must + * be careful to reject "the opposite 0" before it confuses the + * subsequent tests. But what about where end is brought down + * by PMD_SIZE below? no, end can't go down to 0 there. + * + * Whereas we round start (addr) and ceiling down, by different + * masks at different levels, in order to test whether a table + * now has no other vmas using it, so can be freed, we don't + * bother to round floor or end up - the tests don't need that. 
+ */ + + addr &= PMD_MASK; + if (addr < floor) { + addr += PMD_SIZE; + if (!addr) + return; + } + if (ceiling) { + ceiling &= PMD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + end -= PMD_SIZE; + if (addr > end - 1) + return; + /* + * We add page table cache pages with PAGE_SIZE, + * (see pte_free_tlb()), flush the tlb if we need + */ + tlb_change_page_size(tlb, PAGE_SIZE); + pgd = pgd_offset(tlb->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + free_p4d_range(tlb, pgd, addr, next, floor, ceiling); + } while (pgd++, addr = next, addr != end); +} + +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *vma, unsigned long floor, + unsigned long ceiling) +{ + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); + + do { + unsigned long addr = vma->vm_start; + struct vm_area_struct *next; + + /* + * Note: USER_PGTABLES_CEILING may be passed as ceiling and may + * be 0. This will underflow and is okay. + */ + next = mas_find(&mas, ceiling - 1); + + /* + * Hide vma from rmap and truncate_pagecache before freeing + * pgtables + */ + unlink_anon_vmas(vma); + unlink_file_vma(vma); + + if (is_vm_hugetlb_page(vma)) { + hugetlb_free_pgd_range(tlb, addr, vma->vm_end, + floor, next ? next->vm_start : ceiling); + } else { + /* + * Optimization: gather nearby vmas into one call down + */ + while (next && next->vm_start <= vma->vm_end + PMD_SIZE + && !is_vm_hugetlb_page(next)) { + vma = next; + next = mas_find(&mas, ceiling - 1); + unlink_anon_vmas(vma); + unlink_file_vma(vma); + } + free_pgd_range(tlb, addr, vma->vm_end, + floor, next ? next->vm_start : ceiling); + } + vma = next; + } while (vma); +} + +void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) +{ + spinlock_t *ptl = pmd_lock(mm, pmd); + + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ + mm_inc_nr_ptes(mm); + /* + * Ensure all pte setup (eg. 
pte page lock and page clearing) are + * visible before the pte is made visible to other CPUs by being + * put into page tables. + * + * The other side of the story is the pointer chasing in the page + * table walking code (when walking the page table without locking; + * ie. most of the time). Fortunately, these data accesses consist + * of a chain of data-dependent loads, meaning most CPUs (alpha + * being the notable exception) will already guarantee loads are + * seen in-order. See the alpha page table accessors for the + * smp_rmb() barriers in page table walking code. + */ + smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ + pmd_populate(mm, pmd, *pte); + *pte = NULL; + } + spin_unlock(ptl); +} + +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) +{ + pgtable_t new = pte_alloc_one(mm); + if (!new) + return -ENOMEM; + + pmd_install(mm, pmd, &new); + if (new) + pte_free(mm, new); + return 0; +} + +int __pte_alloc_kernel(pmd_t *pmd) +{ + pte_t *new = pte_alloc_one_kernel(&init_mm); + if (!new) + return -ENOMEM; + + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ + smp_wmb(); /* See comment in pmd_install() */ + pmd_populate_kernel(&init_mm, pmd, new); + new = NULL; + } + spin_unlock(&init_mm.page_table_lock); + if (new) + pte_free_kernel(&init_mm, new); + return 0; +} + +static inline void init_rss_vec(int *rss) +{ + memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); +} + +static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) +{ + int i; + + if (current->mm == mm) + sync_mm_rss(mm); + for (i = 0; i < NR_MM_COUNTERS; i++) + if (rss[i]) + add_mm_counter(mm, i, rss[i]); +} + +/* + * This function is called to print an error when a bad pte + * is found. For example, we might have a PFN-mapped pte in + * a region that doesn't allow it. + * + * The calling function must still handle the error. 
+ */ +static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, + pte_t pte, struct page *page) +{ + pgd_t *pgd = pgd_offset(vma->vm_mm, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); + pmd_t *pmd = pmd_offset(pud, addr); + struct address_space *mapping; + pgoff_t index; + static unsigned long resume; + static unsigned long nr_shown; + static unsigned long nr_unshown; + + /* + * Allow a burst of 60 reports, then keep quiet for that minute; + * or allow a steady drip of one report per second. + */ + if (nr_shown == 60) { + if (time_before(jiffies, resume)) { + nr_unshown++; + return; + } + if (nr_unshown) { + pr_alert("BUG: Bad page map: %lu messages suppressed\n", + nr_unshown); + nr_unshown = 0; + } + nr_shown = 0; + } + if (nr_shown++ == 0) + resume = jiffies + 60 * HZ; + + mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; + index = linear_page_index(vma, addr); + + pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", + current->comm, + (long long)pte_val(pte), (long long)pmd_val(*pmd)); + if (page) + dump_page(page, "bad pte"); + pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", + (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); + pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", + vma->vm_file, + vma->vm_ops ? vma->vm_ops->fault : NULL, + vma->vm_file ? vma->vm_file->f_op->mmap : NULL, + mapping ? mapping->a_ops->read_folio : NULL); + dump_stack(); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); +} + +/* + * vm_normal_page -- This function gets the "struct page" associated with a pte. + * + * "Special" mappings do not wish to be associated with a "struct page" (either + * it doesn't exist, or it exists but they don't want to touch it). In this + * case, NULL is returned here. "Normal" mappings do have a struct page. + * + * There are 2 broad cases. 
Firstly, an architecture may define a pte_special() + * pte bit, in which case this function is trivial. Secondly, an architecture + * may not have a spare pte bit, which requires a more complicated scheme, + * described below. + * + * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a + * special mapping (even if there are underlying and valid "struct pages"). + * COWed pages of a VM_PFNMAP are always normal. + * + * The way we recognize COWed pages within VM_PFNMAP mappings is through the + * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit + * set, and the vm_pgoff will point to the first PFN mapped: thus every special + * mapping will always honor the rule + * + * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) + * + * And for normal mappings this is false. + * + * This restricts such mappings to be a linear translation from virtual address + * to pfn. To get around this restriction, we allow arbitrary mappings so long + * as the vma is not a COW mapping; in that case, we know that all ptes are + * special (because none can have been COWed). + * + * + * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. + * + * VM_MIXEDMAP mappings can likewise contain memory with or without "struct + * page" backing, however the difference is that _all_ pages with a struct + * page (that is, those where pfn_valid is true) are refcounted and considered + * normal pages by the VM. The disadvantage is that pages are refcounted + * (which can be slower and simply not an option for some PFNMAP users). The + * advantage is that we don't have to follow the strict linearity rule of + * PFNMAP mappings in order to support COWable mappings. 
+ * + */ +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { + if (likely(!pte_special(pte))) + goto check_pfn; + if (vma->vm_ops && vma->vm_ops->find_special_page) + return vma->vm_ops->find_special_page(vma, addr); + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) + return NULL; + if (is_zero_pfn(pfn)) + return NULL; + if (pte_devmap(pte)) + /* + * NOTE: New users of ZONE_DEVICE will not set pte_devmap() + * and will have refcounts incremented on their struct pages + * when they are inserted into PTEs, thus they are safe to + * return here. Legacy ZONE_DEVICE pages that set pte_devmap() + * do not have refcounts. Example of legacy ZONE_DEVICE is + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. + */ + return NULL; + + print_bad_pte(vma, addr, pte, NULL); + return NULL; + } + + /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ + + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + if (is_zero_pfn(pfn)) + return NULL; + +check_pfn: + if (unlikely(pfn > highest_memmap_pfn)) { + print_bad_pte(vma, addr, pte, NULL); + return NULL; + } + + /* + * NOTE! We still have PageReserved() pages in the page tables. + * eg. VDSO mappings can cause them to exist. 
+ */ +out: + return pfn_to_page(pfn); +} + +struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) +{ + struct page *page = vm_normal_page(vma, addr, pte); + + if (page) + return page_folio(page); + return NULL; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + /* + * There is no pmd_special() but there may be special pmds, e.g. + * in a direct-access (dax) mapping, so let's just replicate the + * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. + */ + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + if (pmd_devmap(pmd)) + return NULL; + if (is_huge_zero_pmd(pmd)) + return NULL; + if (unlikely(pfn > highest_memmap_pfn)) + return NULL; + + /* + * NOTE! We still have PageReserved() pages in the page tables. + * eg. VDSO mappings can cause them to exist. + */ +out: + return pfn_to_page(pfn); +} +#endif + +static void restore_exclusive_pte(struct vm_area_struct *vma, + struct page *page, unsigned long address, + pte_t *ptep) +{ + pte_t pte; + swp_entry_t entry; + + pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); + if (pte_swp_soft_dirty(*ptep)) + pte = pte_mksoft_dirty(pte); + + entry = pte_to_swp_entry(*ptep); + if (pte_swp_uffd_wp(*ptep)) + pte = pte_mkuffd_wp(pte); + else if (is_writable_device_exclusive_entry(entry)) + pte = maybe_mkwrite(pte_mkdirty(pte), vma); + + VM_BUG_ON(pte_write(pte) && !(PageAnon(page) && PageAnonExclusive(page))); + + /* + * No need to take a page reference as one was already + * created when the swap entry was made. 
+ */ + if (PageAnon(page)) + page_add_anon_rmap(page, vma, address, RMAP_NONE); + else + /* + * Currently device exclusive access only supports anonymous + * memory so the entry shouldn't point to a filebacked page. + */ + WARN_ON_ONCE(1); + + set_pte_at(vma->vm_mm, address, ptep, pte); + + /* + * No need to invalidate - it was non-present before. However + * secondary CPUs may have mappings that need invalidating. + */ + update_mmu_cache(vma, address, ptep); +} + +/* + * Tries to restore an exclusive pte if the page lock can be acquired without + * sleeping. + */ +static int +try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma, + unsigned long addr) +{ + swp_entry_t entry = pte_to_swp_entry(*src_pte); + struct page *page = pfn_swap_entry_to_page(entry); + + if (trylock_page(page)) { + restore_exclusive_pte(vma, page, addr, src_pte); + unlock_page(page); + return 0; + } + + return -EBUSY; +} + +/* + * copy one vm_area from one task to the other. Assumes the page tables + * already present in the new task to be cleared in the whole range + * covered by this vma. + */ + +static unsigned long +copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, unsigned long addr, int *rss) +{ + unsigned long vm_flags = dst_vma->vm_flags; + pte_t pte = *src_pte; + struct page *page; + swp_entry_t entry = pte_to_swp_entry(pte); + + if (likely(!non_swap_entry(entry))) { + if (swap_duplicate(entry) < 0) + return -EIO; + + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + /* Mark the swap entry as shared. 
*/ + if (pte_swp_exclusive(*src_pte)) { + pte = pte_swp_clear_exclusive(*src_pte); + set_pte_at(src_mm, addr, src_pte, pte); + } + rss[MM_SWAPENTS]++; + } else if (is_migration_entry(entry)) { + page = pfn_swap_entry_to_page(entry); + + rss[mm_counter(page)]++; + + if (!is_readable_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both parent and child + * to be set to read. A previously exclusive entry is + * now shared. + */ + entry = make_readable_migration_entry( + swp_offset(entry)); + pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*src_pte)) + pte = pte_swp_mksoft_dirty(pte); + if (pte_swp_uffd_wp(*src_pte)) + pte = pte_swp_mkuffd_wp(pte); + set_pte_at(src_mm, addr, src_pte, pte); + } + } else if (is_device_private_entry(entry)) { + page = pfn_swap_entry_to_page(entry); + + /* + * Update rss count even for unaddressable pages, as + * they should treated just like normal pages in this + * respect. + * + * We will likely want to have some new rss counters + * for unaddressable pages, at some point. But for now + * keep things as they are. + */ + get_page(page); + rss[mm_counter(page)]++; + /* Cannot fail as these pages cannot get pinned. */ + BUG_ON(page_try_dup_anon_rmap(page, false, src_vma)); + + /* + * We do not preserve soft-dirty information, because so + * far, checkpoint/restore is the only feature that + * requires that. And checkpoint/restore does not work + * when a device driver is involved (you cannot easily + * save and restore device driver state). 
+ */ + if (is_writable_device_private_entry(entry) && + is_cow_mapping(vm_flags)) { + entry = make_readable_device_private_entry( + swp_offset(entry)); + pte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(*src_pte)) + pte = pte_swp_mkuffd_wp(pte); + set_pte_at(src_mm, addr, src_pte, pte); + } + } else if (is_device_exclusive_entry(entry)) { + /* + * Make device exclusive entries present by restoring the + * original entry then copying as for a present pte. Device + * exclusive entries currently only support private writable + * (ie. COW) mappings. + */ + VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags)); + if (try_restore_exclusive_pte(src_pte, src_vma, addr)) + return -EBUSY; + return -ENOENT; + } else if (is_pte_marker_entry(entry)) { + if (userfaultfd_wp(dst_vma)) + set_pte_at(dst_mm, addr, dst_pte, pte); + return 0; + } + if (!userfaultfd_wp(dst_vma)) + pte = pte_swp_clear_uffd_wp(pte); + set_pte_at(dst_mm, addr, dst_pte, pte); + return 0; +} + +/* + * Copy a present and normal page. + * + * NOTE! The usual case is that this isn't required; + * instead, the caller can just increase the page refcount + * and re-use the pte the traditional way. + * + * And if we need a pre-allocated page but don't yet have + * one, return a negative error to let the preallocation + * code know so that it can do so outside the page table + * lock. + */ +static inline int +copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, + struct page **prealloc, struct page *page) +{ + struct page *new_page; + pte_t pte; + + new_page = *prealloc; + if (!new_page) + return -EAGAIN; + + /* + * We have a prealloc page, all good! Take it + * over and copy the page & arm it. 
+ */ + *prealloc = NULL; + copy_user_highpage(new_page, page, addr, src_vma); + __SetPageUptodate(new_page); + page_add_new_anon_rmap(new_page, dst_vma, addr); + lru_cache_add_inactive_or_unevictable(new_page, dst_vma); + rss[mm_counter(new_page)]++; + + /* All done, just insert the new page copy in the child */ + pte = mk_pte(new_page, dst_vma->vm_page_prot); + pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); + if (userfaultfd_pte_wp(dst_vma, *src_pte)) + /* Uffd-wp needs to be delivered to dest pte as well */ + pte = pte_wrprotect(pte_mkuffd_wp(pte)); + set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); + return 0; +} + +/* + * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page + * is required to copy this pte. + */ +static inline int +copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, + struct page **prealloc) +{ + struct mm_struct *src_mm = src_vma->vm_mm; + unsigned long vm_flags = src_vma->vm_flags; + pte_t pte = *src_pte; + struct page *page; + + page = vm_normal_page(src_vma, addr, pte); + if (page && PageAnon(page)) { + /* + * If this page may have been pinned by the parent process, + * copy the page immediately for the child so that we'll always + * guarantee the pinned page won't be randomly replaced in the + * future. + */ + get_page(page); + if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) { + /* Page maybe pinned, we have to copy. 
*/ + put_page(page); + return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, prealloc, page); + } + rss[mm_counter(page)]++; + } else if (page) { + get_page(page); + page_dup_file_rmap(page, false); + rss[mm_counter(page)]++; + } + + /* + * If it's a COW mapping, write protect it both + * in the parent and the child + */ + if (is_cow_mapping(vm_flags) && pte_write(pte)) { + ptep_set_wrprotect(src_mm, addr, src_pte); + pte = pte_wrprotect(pte); + } + VM_BUG_ON(page && PageAnon(page) && PageAnonExclusive(page)); + + /* + * If it's a shared mapping, mark it clean in + * the child + */ + if (vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + + if (!userfaultfd_wp(dst_vma)) + pte = pte_clear_uffd_wp(pte); + + set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); + return 0; +} + +static inline struct page * +page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *new_page; + + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr); + if (!new_page) + return NULL; + + if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) { + put_page(new_page); + return NULL; + } + cgroup_throttle_swaprate(new_page, GFP_KERNEL); + + return new_page; +} + +static int +copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + unsigned long end) +{ + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; + pte_t *orig_src_pte, *orig_dst_pte; + pte_t *src_pte, *dst_pte; + spinlock_t *src_ptl, *dst_ptl; + int progress, ret = 0; + int rss[NR_MM_COUNTERS]; + swp_entry_t entry = (swp_entry_t){0}; + struct page *prealloc = NULL; + +again: + progress = 0; + init_rss_vec(rss); + + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); + if (!dst_pte) { + ret = -ENOMEM; + goto out; + } + src_pte = pte_offset_map(src_pmd, addr); + src_ptl = pte_lockptr(src_mm, src_pmd); + 
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + orig_src_pte = src_pte; + orig_dst_pte = dst_pte; + arch_enter_lazy_mmu_mode(); + + do { + /* + * We are holding two locks at this point - either of them + * could generate latencies in another task on another CPU. + */ + if (progress >= 32) { + progress = 0; + if (need_resched() || + spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) + break; + } + if (pte_none(*src_pte)) { + progress++; + continue; + } + if (unlikely(!pte_present(*src_pte))) { + ret = copy_nonpresent_pte(dst_mm, src_mm, + dst_pte, src_pte, + dst_vma, src_vma, + addr, rss); + if (ret == -EIO) { + entry = pte_to_swp_entry(*src_pte); + break; + } else if (ret == -EBUSY) { + break; + } else if (!ret) { + progress += 8; + continue; + } + + /* + * Device exclusive entry restored, continue by copying + * the now present pte. + */ + WARN_ON_ONCE(ret != -ENOENT); + } + /* copy_present_pte() will clear `*prealloc' if consumed */ + ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, &prealloc); + /* + * If we need a pre-allocated page for this pte, drop the + * locks, allocate, and try again. + */ + if (unlikely(ret == -EAGAIN)) + break; + if (unlikely(prealloc)) { + /* + * pre-alloc page cannot be reused by next time so as + * to strictly follow mempolicy (e.g., alloc_page_vma() + * will allocate page according to address). This + * could only happen if one pinned pte changed. 
+ */ + put_page(prealloc); + prealloc = NULL; + } + progress += 8; + } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + + arch_leave_lazy_mmu_mode(); + spin_unlock(src_ptl); + pte_unmap(orig_src_pte); + add_mm_rss_vec(dst_mm, rss); + pte_unmap_unlock(orig_dst_pte, dst_ptl); + cond_resched(); + + if (ret == -EIO) { + VM_WARN_ON_ONCE(!entry.val); + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { + ret = -ENOMEM; + goto out; + } + entry.val = 0; + } else if (ret == -EBUSY) { + goto out; + } else if (ret == -EAGAIN) { + prealloc = page_copy_prealloc(src_mm, src_vma, addr); + if (!prealloc) + return -ENOMEM; + } else if (ret) { + VM_WARN_ON_ONCE(1); + } + + /* We've captured and resolved the error. Reset, try again. */ + ret = 0; + + if (addr != end) + goto again; +out: + if (unlikely(prealloc)) + put_page(prealloc); + return ret; +} + +static inline int +copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pud_t *dst_pud, pud_t *src_pud, unsigned long addr, + unsigned long end) +{ + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; + pmd_t *src_pmd, *dst_pmd; + unsigned long next; + + dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); + if (!dst_pmd) + return -ENOMEM; + src_pmd = pmd_offset(src_pud, addr); + do { + next = pmd_addr_end(addr, end); + if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) + || pmd_devmap(*src_pmd)) { + int err; + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); + err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, + addr, dst_vma, src_vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } + if (pmd_none_or_clear_bad(src_pmd)) + continue; + if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd, + addr, next)) + return -ENOMEM; + } while (dst_pmd++, src_pmd++, addr = next, addr != end); + return 0; +} + +static inline int +copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + p4d_t 
*dst_p4d, p4d_t *src_p4d, unsigned long addr, + unsigned long end) +{ + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; + pud_t *src_pud, *dst_pud; + unsigned long next; + + dst_pud = pud_alloc(dst_mm, dst_p4d, addr); + if (!dst_pud) + return -ENOMEM; + src_pud = pud_offset(src_p4d, addr); + do { + next = pud_addr_end(addr, end); + if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { + int err; + + VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); + err = copy_huge_pud(dst_mm, src_mm, + dst_pud, src_pud, addr, src_vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } + if (pud_none_or_clear_bad(src_pud)) + continue; + if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud, + addr, next)) + return -ENOMEM; + } while (dst_pud++, src_pud++, addr = next, addr != end); + return 0; +} + +static inline int +copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, + pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr, + unsigned long end) +{ + struct mm_struct *dst_mm = dst_vma->vm_mm; + p4d_t *src_p4d, *dst_p4d; + unsigned long next; + + dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr); + if (!dst_p4d) + return -ENOMEM; + src_p4d = p4d_offset(src_pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(src_p4d)) + continue; + if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d, + addr, next)) + return -ENOMEM; + } while (dst_p4d++, src_p4d++, addr = next, addr != end); + return 0; +} + +/* + * Return true if the vma needs to copy the pgtable during this fork(). Return + * false when we can speed up fork() by allowing lazy page faults later until + * when the child accesses the memory range. + */ +static bool +vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) +{ + /* + * Always copy pgtables when dst_vma has uffd-wp enabled even if it's + * file-backed (e.g. shmem). 
Because when uffd-wp is enabled, pgtable + * contains uffd-wp protection information, that's something we can't + * retrieve from page cache, and skip copying will lose those info. + */ + if (userfaultfd_wp(dst_vma)) + return true; + + if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) + return true; + + if (src_vma->anon_vma) + return true; + + /* + * Don't copy ptes where a page fault will fill them correctly. Fork + * becomes much lighter when there are big shared or private readonly + * mappings. The tradeoff is that copy_page_range is more efficient + * than faulting. + */ + return false; +} + +int +copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) +{ + pgd_t *src_pgd, *dst_pgd; + unsigned long next; + unsigned long addr = src_vma->vm_start; + unsigned long end = src_vma->vm_end; + struct mm_struct *dst_mm = dst_vma->vm_mm; + struct mm_struct *src_mm = src_vma->vm_mm; + struct mmu_notifier_range range; + bool is_cow; + int ret; + + if (!vma_needs_copy(dst_vma, src_vma)) + return 0; + + if (is_vm_hugetlb_page(src_vma)) + return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma); + + if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { + /* + * We do not free on error cases below as remove_vma + * gets called on error from higher level routine + */ + ret = track_pfn_copy(src_vma); + if (ret) + return ret; + } + + /* + * We need to invalidate the secondary MMU mappings only when + * there could be a permission downgrade on the ptes of the + * parent mm. And a permission downgrade will only happen if + * is_cow_mapping() returns true. + */ + is_cow = is_cow_mapping(src_vma->vm_flags); + + if (is_cow) { + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, + 0, src_vma, src_mm, addr, end); + mmu_notifier_invalidate_range_start(&range); + /* + * Disabling preemption is not needed for the write side, as + * the read side doesn't spin, but goes to the mmap_lock. 
+ * + * Use the raw variant of the seqcount_t write API to avoid + * lockdep complaining about preemptibility. + */ + mmap_assert_write_locked(src_mm); + raw_write_seqcount_begin(&src_mm->write_protect_seq); + } + + ret = 0; + dst_pgd = pgd_offset(dst_mm, addr); + src_pgd = pgd_offset(src_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(src_pgd)) + continue; + if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, + addr, next))) { + ret = -ENOMEM; + break; + } + } while (dst_pgd++, src_pgd++, addr = next, addr != end); + + if (is_cow) { + raw_write_seqcount_end(&src_mm->write_protect_seq); + mmu_notifier_invalidate_range_end(&range); + } + return ret; +} + +/* Whether we should zap all COWed (private) pages too */ +static inline bool should_zap_cows(struct zap_details *details) +{ + /* By default, zap all pages */ + if (!details) + return true; + + /* Or, we zap COWed pages only if the caller wants to */ + return details->even_cows; +} + +/* Decides whether we should zap this page with the page pointer specified */ +static inline bool should_zap_page(struct zap_details *details, struct page *page) +{ + /* If we can make a decision without *page.. */ + if (should_zap_cows(details)) + return true; + + /* E.g. the caller passes NULL for the case of a zero page */ + if (!page) + return true; + + /* Otherwise we should only zap non-anon pages */ + return !PageAnon(page); +} + +static inline bool zap_drop_file_uffd_wp(struct zap_details *details) +{ + if (!details) + return false; + + return details->zap_flags & ZAP_FLAG_DROP_MARKER; +} + +/* + * This function makes sure that we'll replace the none pte with an uffd-wp + * swap special pte marker when necessary. Must be with the pgtable lock held. 
+ */ +static inline void +zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte, + struct zap_details *details, pte_t pteval) +{ +#ifdef CONFIG_PTE_MARKER_UFFD_WP + if (zap_drop_file_uffd_wp(details)) + return; + + pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); +#endif +} + +static unsigned long zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + struct mm_struct *mm = tlb->mm; + int force_flush = 0; + int rss[NR_MM_COUNTERS]; + spinlock_t *ptl; + pte_t *start_pte; + pte_t *pte; + swp_entry_t entry; + + tlb_change_page_size(tlb, PAGE_SIZE); +again: + init_rss_vec(rss); + start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte = start_pte; + flush_tlb_batched_pending(mm); + arch_enter_lazy_mmu_mode(); + do { + pte_t ptent = *pte; + struct page *page; + + if (pte_none(ptent)) + continue; + + if (need_resched()) + break; + + if (pte_present(ptent)) { + page = vm_normal_page(vma, addr, ptent); + if (unlikely(!should_zap_page(details, page))) + continue; + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + tlb_remove_tlb_entry(tlb, pte, addr); + zap_install_uffd_wp_if_needed(vma, addr, pte, details, + ptent); + if (unlikely(!page)) + continue; + + if (!PageAnon(page)) { + if (pte_dirty(ptent)) { + force_flush = 1; + set_page_dirty(page); + } + if (pte_young(ptent) && + likely(!(vma->vm_flags & VM_SEQ_READ))) + mark_page_accessed(page); + } + rss[mm_counter(page)]--; + page_remove_rmap(page, vma, false); + if (unlikely(page_mapcount(page) < 0)) + print_bad_pte(vma, addr, ptent, page); + if (unlikely(__tlb_remove_page(tlb, page))) { + force_flush = 1; + addr += PAGE_SIZE; + break; + } + continue; + } + + entry = pte_to_swp_entry(ptent); + if (is_device_private_entry(entry) || + is_device_exclusive_entry(entry)) { + page = pfn_swap_entry_to_page(entry); + if (unlikely(!should_zap_page(details, page))) + 
continue; + /* + * Both device private/exclusive mappings should only + * work with anonymous page so far, so we don't need to + * consider uffd-wp bit when zap. For more information, + * see zap_install_uffd_wp_if_needed(). + */ + WARN_ON_ONCE(!vma_is_anonymous(vma)); + rss[mm_counter(page)]--; + if (is_device_private_entry(entry)) + page_remove_rmap(page, vma, false); + put_page(page); + } else if (!non_swap_entry(entry)) { + /* Genuine swap entry, hence a private anon page */ + if (!should_zap_cows(details)) + continue; + rss[MM_SWAPENTS]--; + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); + } else if (is_migration_entry(entry)) { + page = pfn_swap_entry_to_page(entry); + if (!should_zap_page(details, page)) + continue; + rss[mm_counter(page)]--; + } else if (pte_marker_entry_uffd_wp(entry)) { + /* Only drop the uffd-wp marker if explicitly requested */ + if (!zap_drop_file_uffd_wp(details)) + continue; + } else if (is_hwpoison_entry(entry) || + is_swapin_error_entry(entry)) { + if (!should_zap_cows(details)) + continue; + } else { + /* We should have covered all the swap entry types */ + WARN_ON_ONCE(1); + } + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); + } while (pte++, addr += PAGE_SIZE, addr != end); + + add_mm_rss_vec(mm, rss); + arch_leave_lazy_mmu_mode(); + + /* Do the actual TLB flush before dropping ptl */ + if (force_flush) + tlb_flush_mmu_tlbonly(tlb); + pte_unmap_unlock(start_pte, ptl); + + /* + * If we forced a TLB flush (either due to running out of + * batch buffers or because we needed to flush dirty TLB + * entries before releasing the ptl), free the batched + * memory too. Restart if we didn't do everything. 
+ */ + if (force_flush) { + force_flush = 0; + tlb_flush_mmu(tlb); + } + + if (addr != end) { + cond_resched(); + goto again; + } + + return addr; +} + +static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) + __split_huge_pmd(vma, pmd, addr, false, NULL); + else if (zap_huge_pmd(tlb, vma, pmd, addr)) + goto next; + /* fall through */ + } else if (details && details->single_folio && + folio_test_pmd_mappable(details->single_folio) && + next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { + spinlock_t *ptl = pmd_lock(tlb->mm, pmd); + /* + * Take and drop THP pmd lock so that we cannot return + * prematurely, while zap_huge_pmd() has cleared *pmd, + * but not yet decremented compound_mapcount(). + */ + spin_unlock(ptl); + } + + /* + * Here there can be other concurrent MADV_DONTNEED or + * trans huge page faults running, and if the pmd is + * none or trans huge it can change under us. This is + * because MADV_DONTNEED holds the mmap_lock in read + * mode. 
+ */ + if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + goto next; + next = zap_pte_range(tlb, vma, pmd, addr, next, details); +next: + cond_resched(); + } while (pmd++, addr = next, addr != end); + + return addr; +} + +static inline unsigned long zap_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, p4d_t *p4d, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (pud_trans_huge(*pud) || pud_devmap(*pud)) { + if (next - addr != HPAGE_PUD_SIZE) { + mmap_assert_locked(tlb->mm); + split_huge_pud(vma, pud, addr); + } else if (zap_huge_pud(tlb, vma, pud, addr)) + goto next; + /* fall through */ + } + if (pud_none_or_clear_bad(pud)) + continue; + next = zap_pmd_range(tlb, vma, pud, addr, next, details); +next: + cond_resched(); + } while (pud++, addr = next, addr != end); + + return addr; +} + +static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) + continue; + next = zap_pud_range(tlb, vma, p4d, addr, next, details); + } while (p4d++, addr = next, addr != end); + + return addr; +} + +void unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details) +{ + pgd_t *pgd; + unsigned long next; + + BUG_ON(addr >= end); + tlb_start_vma(tlb, vma); + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + next = zap_p4d_range(tlb, vma, pgd, addr, next, details); + } while (pgd++, addr = next, addr != end); + tlb_end_vma(tlb, vma); +} + + +static void unmap_single_vma(struct mmu_gather *tlb, + struct vm_area_struct 
*vma, unsigned long start_addr, + unsigned long end_addr, + struct zap_details *details) +{ + unsigned long start = max(vma->vm_start, start_addr); + unsigned long end; + + if (start >= vma->vm_end) + return; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return; + + if (vma->vm_file) + uprobe_munmap(vma, start, end); + + if (unlikely(vma->vm_flags & VM_PFNMAP)) + untrack_pfn(vma, 0, 0); + + if (start != end) { + if (unlikely(is_vm_hugetlb_page(vma))) { + /* + * It is undesirable to test vma->vm_file as it + * should be non-null for valid hugetlb area. + * However, vm_file will be NULL in the error + * cleanup path of mmap_region. When + * hugetlbfs ->mmap method fails, + * mmap_region() nullifies vma->vm_file + * before calling this function to clean up. + * Since no pte has actually been setup, it is + * safe to do nothing in this case. + */ + if (vma->vm_file) { + zap_flags_t zap_flags = details ? + details->zap_flags : 0; + __unmap_hugepage_range_final(tlb, vma, start, end, + NULL, zap_flags); + } + } else + unmap_page_range(tlb, vma, start, end, details); + } +} + +/** + * unmap_vmas - unmap a range of memory covered by a list of vma's + * @tlb: address of the caller's struct mmu_gather + * @mt: the maple tree + * @vma: the starting vma + * @start_addr: virtual address at which to start unmapping + * @end_addr: virtual address at which to end unmapping + * + * Unmap all pages in the vma list. + * + * Only addresses between `start' and `end' will be unmapped. + * + * The VMA list must be sorted in ascending virtual address order. + * + * unmap_vmas() assumes that the caller will flush the whole unmapped address + * range after unmap_vmas() returns. So the only responsibility here is to + * ensure that any thus-far unmapped pages are flushed before unmap_vmas() + * drops the lock and schedules. 
+ */ +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr) +{ + struct mmu_notifier_range range; + struct zap_details details = { + .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, + /* Careful - we need to zap private pages too! */ + .even_cows = true, + }; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); + + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + start_addr, end_addr); + mmu_notifier_invalidate_range_start(&range); + do { + unmap_single_vma(tlb, vma, start_addr, end_addr, &details); + } while ((vma = mas_find(&mas, end_addr - 1)) != NULL); + mmu_notifier_invalidate_range_end(&range); +} + +/** + * zap_page_range - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @start: starting address of pages to zap + * @size: number of bytes to zap + * + * Caller must protect the VMA list + */ +void zap_page_range(struct vm_area_struct *vma, unsigned long start, + unsigned long size) +{ + struct maple_tree *mt = &vma->vm_mm->mm_mt; + unsigned long end = start + size; + struct mmu_notifier_range range; + struct mmu_gather tlb; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); + + lru_add_drain(); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + start, start + size); + tlb_gather_mmu(&tlb, vma->vm_mm); + update_hiwater_rss(vma->vm_mm); + mmu_notifier_invalidate_range_start(&range); + do { + unmap_single_vma(&tlb, vma, start, range.end, NULL); + } while ((vma = mas_find(&mas, end - 1)) != NULL); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); +} + +/** + * zap_page_range_single - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + * @details: details of shared cache invalidation + * + * The range must fit into one VMA. 
+ */
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+		unsigned long size, struct zap_details *details)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	const unsigned long stop = address + size;
+	struct mmu_notifier_range notify;
+	struct mmu_gather gather;
+
+	lru_add_drain();
+	mmu_notifier_range_init(&notify, MMU_NOTIFY_CLEAR, 0, vma, mm,
+				address, stop);
+	/* For hugetlb vmas the notifier window may get adjusted here. */
+	if (is_vm_hugetlb_page(vma))
+		adjust_range_if_pmd_sharing_possible(vma, &notify.start,
+						     &notify.end);
+	tlb_gather_mmu(&gather, mm);
+	update_hiwater_rss(mm);
+	mmu_notifier_invalidate_range_start(&notify);
+	/*
+	 * The unmap itself always uses the caller's [address, stop) and
+	 * never notify.start/notify.end, because the latter could have
+	 * been expanded for hugetlb pmd sharing.
+	 */
+	unmap_single_vma(&gather, vma, address, stop, details);
+	mmu_notifier_invalidate_range_end(&notify);
+	tlb_finish_mmu(&gather);
+}
+
+/**
+ * zap_vma_ptes - remove ptes mapping the vma
+ * @vma: vm_area_struct holding ptes to be zapped
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ *
+ * This function only unmaps ptes assigned to VM_PFNMAP vmas.
+ *
+ * The entire address range must be fully contained within the vma.
+ * + */ +void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, + unsigned long size) +{ + if (!range_in_vma(vma, address, address + size) || + !(vma->vm_flags & VM_PFNMAP)) + return; + + zap_page_range_single(vma, address, size, NULL); +} +EXPORT_SYMBOL_GPL(zap_vma_ptes); + +static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset(mm, addr); + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return NULL; + pud = pud_alloc(mm, p4d, addr); + if (!pud) + return NULL; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return NULL; + + VM_BUG_ON(pmd_trans_huge(*pmd)); + return pmd; +} + +pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl) +{ + pmd_t *pmd = walk_to_pmd(mm, addr); + + if (!pmd) + return NULL; + return pte_alloc_map_lock(mm, pmd, addr, ptl); +} + +static int validate_page_before_insert(struct page *page) +{ + if (PageAnon(page) || PageSlab(page) || page_has_type(page)) + return -EINVAL; + flush_dcache_page(page); + return 0; +} + +static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, + unsigned long addr, struct page *page, pgprot_t prot) +{ + if (!pte_none(*pte)) + return -EBUSY; + /* Ok, finally just insert the thing.. */ + get_page(page); + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + page_add_file_rmap(page, vma, false); + set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); + return 0; +} + +/* + * This is the old fallback for page remapping. + * + * For historical reasons, it only allows reserved pages. Only + * old drivers should use this, and they needed to mark their + * pages reserved for the old functions anyway. 
+ */ +static int insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page, pgprot_t prot) +{ + int retval; + pte_t *pte; + spinlock_t *ptl; + + retval = validate_page_before_insert(page); + if (retval) + goto out; + retval = -ENOMEM; + pte = get_locked_pte(vma->vm_mm, addr, &ptl); + if (!pte) + goto out; + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot); + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +#ifdef pte_index +static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, + unsigned long addr, struct page *page, pgprot_t prot) +{ + int err; + + if (!page_count(page)) + return -EINVAL; + err = validate_page_before_insert(page); + if (err) + return err; + return insert_page_into_pte_locked(vma, pte, addr, page, prot); +} + +/* insert_pages() amortizes the cost of spinlock operations + * when inserting pages in a loop. Arch *must* define pte_index. + */ +static int insert_pages(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num, pgprot_t prot) +{ + pmd_t *pmd = NULL; + pte_t *start_pte, *pte; + spinlock_t *pte_lock; + struct mm_struct *const mm = vma->vm_mm; + unsigned long curr_page_idx = 0; + unsigned long remaining_pages_total = *num; + unsigned long pages_to_write_in_pmd; + int ret; +more: + ret = -EFAULT; + pmd = walk_to_pmd(mm, addr); + if (!pmd) + goto out; + + pages_to_write_in_pmd = min_t(unsigned long, + remaining_pages_total, PTRS_PER_PTE - pte_index(addr)); + + /* Allocate the PTE if necessary; takes PMD lock once only. 
*/ + ret = -ENOMEM; + if (pte_alloc(mm, pmd)) + goto out; + + while (pages_to_write_in_pmd) { + int pte_idx = 0; + const int batch_size = min_t(int, pages_to_write_in_pmd, 8); + + start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); + for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { + int err = insert_page_in_batch_locked(vma, pte, + addr, pages[curr_page_idx], prot); + if (unlikely(err)) { + pte_unmap_unlock(start_pte, pte_lock); + ret = err; + remaining_pages_total -= pte_idx; + goto out; + } + addr += PAGE_SIZE; + ++curr_page_idx; + } + pte_unmap_unlock(start_pte, pte_lock); + pages_to_write_in_pmd -= batch_size; + remaining_pages_total -= batch_size; + } + if (remaining_pages_total) + goto more; + ret = 0; +out: + *num = remaining_pages_total; + return ret; +} +#endif /* ifdef pte_index */ + +/** + * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock. + * @vma: user vma to map to + * @addr: target start user address of these pages + * @pages: source kernel pages + * @num: in: number of pages to map. out: number of pages that were *not* + * mapped. (0 means all pages were successfully mapped). + * + * Preferred over vm_insert_page() when inserting multiple pages. + * + * In case of error, we may have mapped a subset of the provided + * pages. It is the caller's responsibility to account for this case. + * + * The same restrictions apply as in vm_insert_page(). + */ +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num) +{ +#ifdef pte_index + const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; + + if (addr < vma->vm_start || end_addr >= vma->vm_end) + return -EFAULT; + if (!(vma->vm_flags & VM_MIXEDMAP)) { + BUG_ON(mmap_read_trylock(vma->vm_mm)); + BUG_ON(vma->vm_flags & VM_PFNMAP); + vma->vm_flags |= VM_MIXEDMAP; + } + /* Defer page refcount checking till we're about to map that page. 
*/ + return insert_pages(vma, addr, pages, num, vma->vm_page_prot); +#else + unsigned long idx = 0, pgcount = *num; + int err = -EINVAL; + + for (; idx < pgcount; ++idx) { + err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]); + if (err) + break; + } + *num = pgcount - idx; + return err; +#endif /* ifdef pte_index */ +} +EXPORT_SYMBOL(vm_insert_pages); + +/** + * vm_insert_page - insert single page into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @page: source kernel page + * + * This allows drivers to insert individual pages they've allocated + * into a user vma. + * + * The page has to be a nice clean _individual_ kernel allocation. + * If you allocate a compound page, you need to have marked it as + * such (__GFP_COMP), or manually just split the page up yourself + * (see split_page()). + * + * NOTE! Traditionally this was done with "remap_pfn_range()" which + * took an arbitrary page protection parameter. This doesn't allow + * that. Your vma protection will have to be set up correctly, which + * means that if you want a shared writable mapping, you'd better + * ask for a shared writable mapping! + * + * The page does not need to be reserved. + * + * Usually this function is called from f_op->mmap() handler + * under mm->mmap_lock write-lock, so it can change vma->vm_flags. + * Caller must set VM_MIXEDMAP on vma if it wants to call this + * function from other places, for example from page-fault handler. + * + * Return: %0 on success, negative error code otherwise. 
+ */ +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page) +{ + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + if (!page_count(page)) + return -EINVAL; + if (!(vma->vm_flags & VM_MIXEDMAP)) { + BUG_ON(mmap_read_trylock(vma->vm_mm)); + BUG_ON(vma->vm_flags & VM_PFNMAP); + vma->vm_flags |= VM_MIXEDMAP; + } + return insert_page(vma, addr, page, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_page); + +/* + * __vm_map_pages - maps range of kernel pages into user vma + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * @offset: user's requested vm_pgoff + * + * This allows drivers to map range of kernel pages into a user vma. + * + * Return: 0 on success and error code otherwise. + */ +static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num, unsigned long offset) +{ + unsigned long count = vma_pages(vma); + unsigned long uaddr = vma->vm_start; + int ret, i; + + /* Fail if the user requested offset is beyond the end of the object */ + if (offset >= num) + return -ENXIO; + + /* Fail if the user requested size exceeds available object size */ + if (count > num - offset) + return -ENXIO; + + for (i = 0; i < count; i++) { + ret = vm_insert_page(vma, uaddr, pages[offset + i]); + if (ret < 0) + return ret; + uaddr += PAGE_SIZE; + } + + return 0; +} + +/** + * vm_map_pages - maps range of kernel pages starts with non zero offset + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * + * Maps an object consisting of @num pages, catering for the user's + * requested vm_pgoff + * + * If we fail to insert any page into the vma, the function will return + * immediately leaving any previously inserted pages present. 
Callers + * from the mmap handler may immediately return the error as their caller + * will destroy the vma, removing any successfully inserted pages. Other + * callers should make their own arrangements for calling unmap_region(). + * + * Context: Process context. Called by mmap handlers. + * Return: 0 on success and error code otherwise. + */ +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return __vm_map_pages(vma, pages, num, vma->vm_pgoff); +} +EXPORT_SYMBOL(vm_map_pages); + +/** + * vm_map_pages_zero - map range of kernel pages starts with zero offset + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * + * Similar to vm_map_pages(), except that it explicitly sets the offset + * to 0. This function is intended for the drivers that did not consider + * vm_pgoff. + * + * Context: Process context. Called by mmap handlers. + * Return: 0 on success and error code otherwise. + */ +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return __vm_map_pages(vma, pages, num, 0); +} +EXPORT_SYMBOL(vm_map_pages_zero); + +static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn, pgprot_t prot, bool mkwrite) +{ + struct mm_struct *mm = vma->vm_mm; + pte_t *pte, entry; + spinlock_t *ptl; + + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + return VM_FAULT_OOM; + if (!pte_none(*pte)) { + if (mkwrite) { + /* + * For read faults on private mappings the PFN passed + * in may not match the PFN we have mapped if the + * mapped PFN is a writeable COW page. In the mkwrite + * case we are creating a writable PTE for a shared + * mapping and we expect the PFNs to match. If they + * don't match, we are likely racing with block + * allocation and mapping invalidation so just skip the + * update. 
+ */ + if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); + goto out_unlock; + } + entry = pte_mkyoung(*pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (ptep_set_access_flags(vma, addr, pte, entry, 1)) + update_mmu_cache(vma, addr, pte); + } + goto out_unlock; + } + + /* Ok, finally just insert the thing.. */ + if (pfn_t_devmap(pfn)) + entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); + else + entry = pte_mkspecial(pfn_t_pte(pfn, prot)); + + if (mkwrite) { + entry = pte_mkyoung(entry); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + } + + set_pte_at(mm, addr, pte, entry); + update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ + +out_unlock: + pte_unmap_unlock(pte, ptl); + return VM_FAULT_NOPAGE; +} + +/** + * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * @pgprot: pgprot flags for the inserted page + * + * This is exactly like vmf_insert_pfn(), except that it allows drivers + * to override pgprot on a per-page basis. + * + * This only makes sense for IO mappings, and it makes no sense for + * COW mappings. In general, using multiple vmas is preferable; + * vmf_insert_pfn_prot should only be used if using multiple VMAs is + * impractical. + * + * See vmf_insert_mixed_prot() for a discussion of the implication of using + * a value of @pgprot different from that of @vma->vm_page_prot. + * + * Context: Process context. May allocate using %GFP_KERNEL. + * Return: vm_fault_t value. + */ +vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot) +{ + /* + * Technically, architectures with pte_special can avoid all these + * restrictions (same for remap_pfn_range). However we would like + * consistency in testing and feature parity among all, so we should + * try to keep these invariants in place for everybody. 
+ */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + if (!pfn_modify_allowed(pfn, pgprot)) + return VM_FAULT_SIGBUS; + + track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); + + return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, + false); +} +EXPORT_SYMBOL(vmf_insert_pfn_prot); + +/** + * vmf_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_insert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return the result of this function. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * Context: Process context. May allocate using %GFP_KERNEL. + * Return: vm_fault_t value. 
+ */ +vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vmf_insert_pfn); + +static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) +{ + /* these checks mirror the abort conditions in vm_normal_page */ + if (vma->vm_flags & VM_MIXEDMAP) + return true; + if (pfn_t_devmap(pfn)) + return true; + if (pfn_t_special(pfn)) + return true; + if (is_zero_pfn(pfn_t_to_pfn(pfn))) + return true; + return false; +} + +static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn, pgprot_t pgprot, + bool mkwrite) +{ + int err; + + BUG_ON(!vm_mixed_ok(vma, pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + track_pfn_insert(vma, &pgprot, pfn); + + if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) + return VM_FAULT_SIGBUS; + + /* + * If we don't have pte special, then we have to use the pfn_valid() + * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* + * refcount the page if pfn_valid is true (hence insert_page rather + * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP + * without pte special, it would there be refcounted as a normal page. + */ + if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && + !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { + struct page *page; + + /* + * At this point we are committed to insert_page() + * regardless of whether the caller specified flags that + * result in pfn_t_has_page() == false. 
+ */ + page = pfn_to_page(pfn_t_to_pfn(pfn)); + err = insert_page(vma, addr, page, pgprot); + } else { + return insert_pfn(vma, addr, pfn, pgprot, mkwrite); + } + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} + +/** + * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * @pgprot: pgprot flags for the inserted page + * + * This is exactly like vmf_insert_mixed(), except that it allows drivers + * to override pgprot on a per-page basis. + * + * Typically this function should be used by drivers to set caching- and + * encryption bits different than those of @vma->vm_page_prot, because + * the caching- or encryption mode may not be known at mmap() time. + * This is ok as long as @vma->vm_page_prot is not used by the core vm + * to set caching and encryption bits for those vmas (except for COW pages). + * This is ensured by core vm only modifying these page table entries using + * functions that don't touch caching- or encryption bits, using pte_modify() + * if needed. (See for example mprotect()). + * Also when new page-table entries are created, this is only done using the + * fault() callback, and never using the value of vma->vm_page_prot, + * except for page-table entries that point to anonymous pages as the result + * of COW. + * + * Context: Process context. May allocate using %GFP_KERNEL. + * Return: vm_fault_t value. 
+ */ +vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn, pgprot_t pgprot) +{ + return __vm_insert_mixed(vma, addr, pfn, pgprot, false); +} +EXPORT_SYMBOL(vmf_insert_mixed_prot); + +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) +{ + return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false); +} +EXPORT_SYMBOL(vmf_insert_mixed); + +/* + * If the insertion of PTE failed because someone else already added a + * different entry in the mean time, we treat that as success as we assume + * the same entry was actually inserted. + */ +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn) +{ + return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true); +} +EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pte_t *pte, *mapped_pte; + spinlock_t *ptl; + int err = 0; + + mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + arch_enter_lazy_mmu_mode(); + do { + BUG_ON(!pte_none(*pte)); + if (!pfn_modify_allowed(pfn, prot)) { + err = -EACCES; + break; + } + set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); + pfn++; + } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(mapped_pte, ptl); + return err; +} + +static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pmd_t *pmd; + unsigned long next; + int err; + + pfn -= addr >> PAGE_SHIFT; + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + 
VM_BUG_ON(pmd_trans_huge(*pmd)); + do { + next = pmd_addr_end(addr, end); + err = remap_pte_range(mm, pmd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + pud_t *pud; + unsigned long next; + int err; + + pfn -= addr >> PAGE_SHIFT; + pud = pud_alloc(mm, p4d, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + err = remap_pmd_range(mm, pud, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; + } while (pud++, addr = next, addr != end); + return 0; +} + +static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long pfn, pgprot_t prot) +{ + p4d_t *p4d; + unsigned long next; + int err; + + pfn -= addr >> PAGE_SHIFT; + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return -ENOMEM; + do { + next = p4d_addr_end(addr, end); + err = remap_pud_range(mm, p4d, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; + } while (p4d++, addr = next, addr != end); + return 0; +} + +/* + * Variant of remap_pfn_range that does not call track_pfn_remap. The caller + * must have pre-validated the caching bits of the pgprot_t. + */ +int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + PAGE_ALIGN(size); + struct mm_struct *mm = vma->vm_mm; + int err; + + if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) + return -EINVAL; + + /* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). 
+ * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. + * + * There's a horrible special case to handle copy-on-write + * behaviour that some programs depend on. We mark the "original" + * un-COW'ed pages by matching them up with "vma->vm_pgoff". + * See vm_normal_page() for details. + */ + if (is_cow_mapping(vma->vm_flags)) { + if (addr != vma->vm_start || end != vma->vm_end) + return -EINVAL; + vma->vm_pgoff = pfn; + } + + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + + BUG_ON(addr >= end); + pfn -= addr >> PAGE_SHIFT; + pgd = pgd_offset(mm, addr); + flush_cache_range(vma, addr, end); + do { + next = pgd_addr_end(addr, end); + err = remap_p4d_range(mm, pgd, addr, next, + pfn + (addr >> PAGE_SHIFT), prot); + if (err) + return err; + } while (pgd++, addr = next, addr != end); + + return 0; +} + +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target page aligned user address to start at + * @pfn: page frame number of kernel physical memory address + * @size: size of mapping area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. 
+ */ +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + int err; + + err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); + if (err) + return -EINVAL; + + err = remap_pfn_range_notrack(vma, addr, pfn, size, prot); + if (err) + untrack_pfn(vma, pfn, PAGE_ALIGN(size)); + return err; +} +EXPORT_SYMBOL(remap_pfn_range); + +/** + * vm_iomap_memory - remap memory to userspace + * @vma: user vma to map to + * @start: start of the physical memory to be mapped + * @len: size of area + * + * This is a simplified io_remap_pfn_range() for common driver use. The + * driver just needs to give us the physical memory range to be mapped, + * we'll figure out the rest from the vma information. + * + * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get + * whatever write-combining details or similar. + * + * Return: %0 on success, negative error code otherwise. + */ +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) +{ + unsigned long vm_len, pfn, pages; + + /* Check that the physical memory area passed in looks valid */ + if (start + len < start) + return -EINVAL; + /* + * You *really* shouldn't map things that aren't page-aligned, + * but we've historically allowed it because IO memory might + * just have smaller alignment. + */ + len += start & ~PAGE_MASK; + pfn = start >> PAGE_SHIFT; + pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; + if (pfn + pages < pfn) + return -EINVAL; + + /* We start the mapping 'vm_pgoff' pages into the area */ + if (vma->vm_pgoff > pages) + return -EINVAL; + pfn += vma->vm_pgoff; + pages -= vma->vm_pgoff; + + /* Can we fit all of the mapping? 
*/ + vm_len = vma->vm_end - vma->vm_start; + if (vm_len >> PAGE_SHIFT > pages) + return -EINVAL; + + /* Ok, let it rip */ + return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_iomap_memory); + +static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data, bool create, + pgtbl_mod_mask *mask) +{ + pte_t *pte, *mapped_pte; + int err = 0; + spinlock_t *ptl; + + if (create) { + mapped_pte = pte = (mm == &init_mm) ? + pte_alloc_kernel_track(pmd, addr, mask) : + pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + } else { + mapped_pte = pte = (mm == &init_mm) ? + pte_offset_kernel(pmd, addr) : + pte_offset_map_lock(mm, pmd, addr, &ptl); + } + + BUG_ON(pmd_huge(*pmd)); + + arch_enter_lazy_mmu_mode(); + + if (fn) { + do { + if (create || !pte_none(*pte)) { + err = fn(pte++, addr, data); + if (err) + break; + } + } while (addr += PAGE_SIZE, addr != end); + } + *mask |= PGTBL_PTE_MODIFIED; + + arch_leave_lazy_mmu_mode(); + + if (mm != &init_mm) + pte_unmap_unlock(mapped_pte, ptl); + return err; +} + +static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data, bool create, + pgtbl_mod_mask *mask) +{ + pmd_t *pmd; + unsigned long next; + int err = 0; + + BUG_ON(pud_huge(*pud)); + + if (create) { + pmd = pmd_alloc_track(mm, pud, addr, mask); + if (!pmd) + return -ENOMEM; + } else { + pmd = pmd_offset(pud, addr); + } + do { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd) && !create) + continue; + if (WARN_ON_ONCE(pmd_leaf(*pmd))) + return -EINVAL; + if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) { + if (!create) + continue; + pmd_clear_bad(pmd); + } + err = apply_to_pte_range(mm, pmd, addr, next, + fn, data, create, mask); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int apply_to_pud_range(struct mm_struct *mm, 
p4d_t *p4d, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data, bool create, + pgtbl_mod_mask *mask) +{ + pud_t *pud; + unsigned long next; + int err = 0; + + if (create) { + pud = pud_alloc_track(mm, p4d, addr, mask); + if (!pud) + return -ENOMEM; + } else { + pud = pud_offset(p4d, addr); + } + do { + next = pud_addr_end(addr, end); + if (pud_none(*pud) && !create) + continue; + if (WARN_ON_ONCE(pud_leaf(*pud))) + return -EINVAL; + if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) { + if (!create) + continue; + pud_clear_bad(pud); + } + err = apply_to_pmd_range(mm, pud, addr, next, + fn, data, create, mask); + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, unsigned long end, + pte_fn_t fn, void *data, bool create, + pgtbl_mod_mask *mask) +{ + p4d_t *p4d; + unsigned long next; + int err = 0; + + if (create) { + p4d = p4d_alloc_track(mm, pgd, addr, mask); + if (!p4d) + return -ENOMEM; + } else { + p4d = p4d_offset(pgd, addr); + } + do { + next = p4d_addr_end(addr, end); + if (p4d_none(*p4d) && !create) + continue; + if (WARN_ON_ONCE(p4d_leaf(*p4d))) + return -EINVAL; + if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) { + if (!create) + continue; + p4d_clear_bad(p4d); + } + err = apply_to_pud_range(mm, p4d, addr, next, + fn, data, create, mask); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, + void *data, bool create) +{ + pgd_t *pgd; + unsigned long start = addr, next; + unsigned long end = addr + size; + pgtbl_mod_mask mask = 0; + int err = 0; + + if (WARN_ON(addr >= end)) + return -EINVAL; + + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none(*pgd) && !create) + continue; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return -EINVAL; + if 
(!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) { + if (!create) + continue; + pgd_clear_bad(pgd); + } + err = apply_to_p4d_range(mm, pgd, addr, next, + fn, data, create, &mask); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, start + size); + + return err; +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. + */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + return __apply_to_page_range(mm, addr, size, fn, data, true); +} +EXPORT_SYMBOL_GPL(apply_to_page_range); + +/* + * Scan a region of virtual memory, calling a provided function on + * each leaf page table where it exists. + * + * Unlike apply_to_page_range, this does _not_ fill in page tables + * where they are absent. + */ +int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + return __apply_to_page_range(mm, addr, size, fn, data, false); +} +EXPORT_SYMBOL_GPL(apply_to_existing_page_range); + +/* + * handle_pte_fault chooses page fault handler according to an entry which was + * read non-atomically. Before making any commitment, on those architectures + * or configurations (e.g. i386 with PAE) which might give a mix of unmatched + * parts, do_swap_page must check under lock before unmapping the pte and + * proceeding (but do_wp_page is only called after already making such a check; + * and do_anonymous_page can safely check later on). 
+ */ +static inline int pte_unmap_same(struct vm_fault *vmf) +{ + int same = 1; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) + if (sizeof(pte_t) > sizeof(unsigned long)) { + spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + spin_lock(ptl); + same = pte_same(*vmf->pte, vmf->orig_pte); + spin_unlock(ptl); + } +#endif + pte_unmap(vmf->pte); + vmf->pte = NULL; + return same; +} + +/* + * Return: + * 0: copied succeeded + * -EHWPOISON: copy failed due to hwpoison in source page + * -EAGAIN: copied failed (some other reason) + */ +static inline int __wp_page_copy_user(struct page *dst, struct page *src, + struct vm_fault *vmf) +{ + int ret; + void *kaddr; + void __user *uaddr; + bool locked = false; + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = vmf->address; + + if (likely(src)) { + if (copy_mc_user_highpage(dst, src, addr, vma)) { + memory_failure_queue(page_to_pfn(src), 0); + return -EHWPOISON; + } + return 0; + } + + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. + */ + kaddr = kmap_atomic(dst); + uaddr = (void __user *)(addr & PAGE_MASK); + + /* + * On architectures with software "accessed" bits, we would + * take a double page fault, so mark it accessed here. 
+ */ + if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) { + pte_t entry; + + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); + locked = true; + if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { + /* + * Other thread has already handled the fault + * and update local tlb only + */ + update_mmu_tlb(vma, addr, vmf->pte); + ret = -EAGAIN; + goto pte_unlock; + } + + entry = pte_mkyoung(vmf->orig_pte); + if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) + update_mmu_cache(vma, addr, vmf->pte); + } + + /* + * This really shouldn't fail, because the page is there + * in the page tables. But it might just be unreadable, + * in which case we just give up and fill the result with + * zeroes. + */ + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { + if (locked) + goto warn; + + /* Re-validate under PTL if the page is still mapped */ + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); + locked = true; + if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { + /* The PTE changed under us, update local tlb */ + update_mmu_tlb(vma, addr, vmf->pte); + ret = -EAGAIN; + goto pte_unlock; + } + + /* + * The same page can be mapped back since last copy attempt. + * Try to copy again under PTL. + */ + if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { + /* + * Give a warn in case there can be some obscure + * use-case + */ +warn: + WARN_ON_ONCE(1); + clear_page(kaddr); + } + } + + ret = 0; + +pte_unlock: + if (locked) + pte_unmap_unlock(vmf->pte, vmf->ptl); + kunmap_atomic(kaddr); + flush_dcache_page(dst); + + return ret; +} + +static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) +{ + struct file *vm_file = vma->vm_file; + + if (vm_file) + return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; + + /* + * Special mappings (e.g. VDSO) do not have any file so fake + * a default GFP_KERNEL for them. 
+ */ + return GFP_KERNEL; +} + +/* + * Notify the address space that the page is about to become writable so that + * it can prohibit this or wait for the page to get into an appropriate state. + * + * We do this without the lock held, so that it can sleep if it needs to. + */ +static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) +{ + vm_fault_t ret; + struct page *page = vmf->page; + unsigned int old_flags = vmf->flags; + + vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + + if (vmf->vma->vm_file && + IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host)) + return VM_FAULT_SIGBUS; + + ret = vmf->vma->vm_ops->page_mkwrite(vmf); + /* Restore original flags so that caller is not surprised */ + vmf->flags = old_flags; + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; + if (unlikely(!(ret & VM_FAULT_LOCKED))) { + lock_page(page); + if (!page->mapping) { + unlock_page(page); + return 0; /* retry */ + } + ret |= VM_FAULT_LOCKED; + } else + VM_BUG_ON_PAGE(!PageLocked(page), page); + return ret; +} + +/* + * Handle dirtying of a page in shared file mapping on a write fault. + * + * The function expects the page to be locked and unlocks it. + */ +static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct address_space *mapping; + struct page *page = vmf->page; + bool dirtied; + bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; + + dirtied = set_page_dirty(page); + VM_BUG_ON_PAGE(PageAnon(page), page); + /* + * Take a local copy of the address_space - page.mapping may be zeroed + * by truncate after unlock_page(). The address_space itself remains + * pinned by vma->vm_file's reference. We rely on unlock_page()'s + * release semantics to prevent the compiler from undoing this copying. + */ + mapping = page_rmapping(page); + unlock_page(page); + + if (!page_mkwrite) + file_update_time(vma->vm_file); + + /* + * Throttle page dirtying rate down to writeback speed. 
+ * + * mapping may be NULL here because some device drivers do not + * set page.mapping but still dirty their pages + * + * Drop the mmap_lock before waiting on IO, if we can. The file + * is pinning the mapping, as per above. + */ + if ((dirtied || page_mkwrite) && mapping) { + struct file *fpin; + + fpin = maybe_unlock_mmap_for_io(vmf, NULL); + balance_dirty_pages_ratelimited(mapping); + if (fpin) { + fput(fpin); + return VM_FAULT_COMPLETED; + } + } + + return 0; +} + +/* + * Handle write page faults for pages that can be reused in the current vma + * + * This can happen either due to the mapping being with the VM_SHARED flag, + * or due to us being the last reference standing to the page. In either + * case, all we need to do here is to mark the page as writable and update + * any related book-keeping. + */ +static inline void wp_page_reuse(struct vm_fault *vmf) + __releases(vmf->ptl) +{ + struct vm_area_struct *vma = vmf->vma; + struct page *page = vmf->page; + pte_t entry; + + VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); + VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page)); + + /* + * Clear the pages cpupid information as the existing + * information potentially belongs to a now completely + * unrelated process. + */ + if (page) + page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); + + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); + entry = pte_mkyoung(vmf->orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) + update_mmu_cache(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); + count_vm_event(PGREUSE); +} + +/* + * Handle the case of a page which we actually need to copy to a new page, + * either due to COW or unsharing. + * + * Called with mmap_lock locked and the old page referenced, but + * without the ptl held. + * + * High level logic flow: + * + * - Allocate a page, copy the content of the old page to the new one. 
* - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 *
 * Return: VM_FAULT_OOM on allocation/charge failure, VM_FAULT_HWPOISON when
 * the copy hit a poisoned source page, VM_FAULT_WRITE when the copy was
 * installed for a write fault, and 0 otherwise (unshare, lost race, or a
 * copy failure that is left for a re-fault).
 */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct page *old_page = vmf->page;
	struct page *new_page = NULL;
	pte_t entry;
	int page_copied = 0;
	struct mmu_notifier_range range;
	int ret;

	delayacct_wpcopy_start();

	if (unlikely(anon_vma_prepare(vma)))
		goto oom;

	/* A zero-page source needs no copy: a freshly zeroed page is enough. */
	if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
		new_page = alloc_zeroed_user_highpage_movable(vma,
							      vmf->address);
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
				vmf->address);
		if (!new_page)
			goto oom;

		ret = __wp_page_copy_user(new_page, old_page, vmf);
		if (ret) {
			/*
			 * COW failed, if the fault was solved by other,
			 * it's fine. If not, userspace would re-fault on
			 * the same address and we will handle the fault
			 * from the second attempt.
			 * The -EHWPOISON case will not be retried.
			 */
			put_page(new_page);
			if (old_page)
				put_page(old_page);

			delayacct_wpcopy_end();
			return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
		}
		kmsan_copy_page_meta(new_page, old_page);
	}

	if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
		goto oom_free_new;
	cgroup_throttle_swaprate(new_page, GFP_KERNEL);

	__SetPageUptodate(new_page);

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				vmf->address & PAGE_MASK,
				(vmf->address & PAGE_MASK) + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	/*
	 * Re-check the pte - we dropped the lock
	 */
	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
		/*
		 * The mapping becomes anonymous: move the accounting from the
		 * file counter (if the old page was not anon) to MM_ANONPAGES.
		 */
		if (old_page) {
			if (!PageAnon(old_page)) {
				dec_mm_counter_fast(mm,
						mm_counter_file(old_page));
				inc_mm_counter_fast(mm, MM_ANONPAGES);
			}
		} else {
			inc_mm_counter_fast(mm, MM_ANONPAGES);
		}
		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
		entry = mk_pte(new_page, vma->vm_page_prot);
		entry = pte_sw_mkyoung(entry);
		if (unlikely(unshare)) {
			/* Unsharing keeps the pte read-only; carry over its markers. */
			if (pte_soft_dirty(vmf->orig_pte))
				entry = pte_mksoft_dirty(entry);
			if (pte_uffd_wp(vmf->orig_pte))
				entry = pte_mkuffd_wp(entry);
		} else {
			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
		}

		/*
		 * Clear the pte entry and flush it first, before updating the
		 * pte with the new entry, to keep TLBs on different CPUs in
		 * sync. This code used to set the new PTE then flush TLBs, but
		 * that left a window where the new PTE could be loaded into
		 * some TLBs while the old PTE remains in others.
		 */
		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
		page_add_new_anon_rmap(new_page, vma, vmf->address);
		lru_cache_add_inactive_or_unevictable(new_page, vma);
		/*
		 * We call the notify macro here because, when using secondary
		 * mmu page tables (such as kvm shadow page tables), we want the
		 * new page to be mapped directly into the secondary page table.
		 */
		BUG_ON(unshare && pte_write(entry));
		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
		update_mmu_cache(vma, vmf->address, vmf->pte);
		if (old_page) {
			/*
			 * Only after switching the pte to the new page may
			 * we remove the mapcount here. Otherwise another
			 * process may come and find the rmap count decremented
			 * before the pte is switched to the new page, and
			 * "reuse" the old page writing into it while our pte
			 * here still points into it and can be read by other
			 * threads.
			 *
			 * The critical issue is to order this
			 * page_remove_rmap with the ptep_clear_flush above.
			 * Those stores are ordered by (if nothing else,)
			 * the barrier present in the atomic_add_negative
			 * in page_remove_rmap.
			 *
			 * Then the TLB flush in ptep_clear_flush ensures that
			 * no process can access the old page before the
			 * decremented mapcount is visible. And the old page
			 * cannot be reused until after the decremented
			 * mapcount is visible. So transitively, TLBs to
			 * old page will be flushed before it can be reused.
			 */
			page_remove_rmap(old_page, vma, false);
		}

		/*
		 * Free the old page..
		 *
		 * new_page now aliases old_page, so the put_page(new_page)
		 * below drops a reference on the old page instead of on the
		 * copy that was just mapped.
		 */
		new_page = old_page;
		page_copied = 1;
	} else {
		update_mmu_tlb(vma, vmf->address, vmf->pte);
	}

	if (new_page)
		put_page(new_page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	/*
	 * No need to double call mmu_notifier->invalidate_range() callback as
	 * the above ptep_clear_flush_notify() did already call it.
	 */
	mmu_notifier_invalidate_range_only_end(&range);
	if (old_page) {
		if (page_copied)
			free_swap_cache(old_page);
		put_page(old_page);
	}

	delayacct_wpcopy_end();
	return (page_copied && !unshare) ? VM_FAULT_WRITE : 0;
oom_free_new:
	put_page(new_page);
oom:
	if (old_page)
		put_page(old_page);

	delayacct_wpcopy_end();
	return VM_FAULT_OOM;
}

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *			  writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
 * we acquired PTE lock.
 */
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
	WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	/*
	 * We might have raced with another page fault while we released the
	 * pte_offset_map_lock.
	 */
	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_NOPAGE;
	}
	/* wp_page_reuse() drops the pte lock taken above. */
	wp_page_reuse(vmf);
	return 0;
}

/*
 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
 * mapping
 *
 * Without a ->pfn_mkwrite handler the pte is simply made writable and
 * VM_FAULT_WRITE is returned. With one, the pte lock is dropped around the
 * handler call; the handler's VM_FAULT_ERROR/VM_FAULT_NOPAGE result is
 * returned as is, otherwise the result of finish_mkwrite_fault().
 */
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;

	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
		vm_fault_t ret;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		vmf->flags |= FAULT_FLAG_MKWRITE;
		ret = vma->vm_ops->pfn_mkwrite(vmf);
		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
			return ret;
		return finish_mkwrite_fault(vmf);
	}
	wp_page_reuse(vmf);
	return VM_FAULT_WRITE;
}

/*
 * Handle a write fault on vmf->page in a shared mapping.
 *
 * A page reference is held for the duration of the call. With a
 * ->page_mkwrite handler the pte lock is dropped, the handler is run through
 * do_page_mkwrite() and the pte is made writable by finish_mkwrite_fault();
 * without one the pte is made writable directly and the page is locked
 * here. Either way the locked page is then handed to
 * fault_dirty_shared_page(), which unlocks it.
 *
 * Return: VM_FAULT_WRITE ORed with the result of fault_dirty_shared_page()
 * on success; otherwise the value returned by do_page_mkwrite() (which may
 * be 0) or by finish_mkwrite_fault().
 */
static vm_fault_t wp_page_shared(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	struct vm_area_struct *vma = vmf->vma;
	vm_fault_t ret = VM_FAULT_WRITE;

	get_page(vmf->page);

	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
		vm_fault_t tmp;

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		tmp = do_page_mkwrite(vmf);
		if (unlikely(!tmp || (tmp &
				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
			put_page(vmf->page);
			return tmp;
		}
		tmp = finish_mkwrite_fault(vmf);
		if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
			unlock_page(vmf->page);
			put_page(vmf->page);
			return tmp;
		}
	} else {
		wp_page_reuse(vmf);
		lock_page(vmf->page);
	}
	ret |= fault_dirty_shared_page(vmf);
	put_page(vmf->page);

	return ret;
}

/*
 * This routine handles present pages, when
 * * users try to write to a shared page (FAULT_FLAG_WRITE)
 * * GUP wants to take a R/O pin on a possibly shared anonymous page
 *   (FAULT_FLAG_UNSHARE)
 *
 * It is done by copying the page to a new address and decrementing the
 * shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
* Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
 * done any necessary COW.
 *
 * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
 * though the page will change only once the write actually happens. This
 * avoids a few races, and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
	__releases(vmf->ptl)
{
	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio;

	/* Exactly one of FAULT_FLAG_UNSHARE and FAULT_FLAG_WRITE must be set. */
	VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
	VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));

	if (likely(!unshare)) {
		if (userfaultfd_pte_wp(vma, *vmf->pte)) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return handle_userfault(vmf, VM_UFFD_WP);
		}

		/*
		 * Userfaultfd write-protect can defer flushes. Ensure the TLB
		 * is flushed in this case before copying.
		 */
		if (unlikely(userfaultfd_wp(vmf->vma) &&
			     mm_tlb_flush_pending(vmf->vma->vm_mm)))
			flush_tlb_page(vmf->vma, vmf->address);
	}

	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
	if (!vmf->page) {
		if (unlikely(unshare)) {
			/* No anonymous page -> nothing to do. */
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return 0;
		}

		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			return wp_pfn_shared(vmf);

		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return wp_page_copy(vmf);
	}

	/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	folio = page_folio(vmf->page);
	if (folio_test_anon(folio)) {
		/*
		 * If the page is exclusive to this process we must reuse the
		 * page without further checks.
		 */
		if (PageAnonExclusive(vmf->page))
			goto reuse;

		/*
		 * We have to verify under folio lock: these early checks are
		 * just an optimization to avoid locking the folio and freeing
		 * the swapcache if there is little hope that we can reuse.
		 *
		 * KSM doesn't necessarily raise the folio refcount.
		 */
		if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
			goto copy;
		if (!folio_test_lru(folio))
			/*
			 * Note: We cannot easily detect+handle references from
			 * remote LRU pagevecs or references to LRU folios.
			 */
			lru_add_drain();
		/* One extra reference is tolerated while in the swapcache. */
		if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
			goto copy;
		if (!folio_trylock(folio))
			goto copy;
		if (folio_test_swapcache(folio))
			folio_free_swap(folio);
		/* Final decision, now made with the folio locked. */
		if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
			folio_unlock(folio);
			goto copy;
		}
		/*
		 * Ok, we've got the only folio reference from our mapping
		 * and the folio is locked, it's dark out, and we're wearing
		 * sunglasses. Hit it.
		 */
		page_move_anon_rmap(vmf->page, vma);
		folio_unlock(folio);
reuse:
		if (unlikely(unshare)) {
			pte_unmap_unlock(vmf->pte, vmf->ptl);
			return 0;
		}
		wp_page_reuse(vmf);
		return VM_FAULT_WRITE;
	} else if (unshare) {
		/* No anonymous page -> nothing to do. */
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return 0;
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		return wp_page_shared(vmf);
	}
copy:
	/*
	 * Ok, we need to copy. Oh, well..
	 *
	 * The reference taken here is the "old page referenced" that
	 * wp_page_copy() expects and drops.
	 */
	get_page(vmf->page);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
#ifdef CONFIG_KSM
	if (PageKsm(vmf->page))
		count_vm_event(COW_KSM);
#endif
	return wp_page_copy(vmf);
}

/*
 * Zap the address range [start_addr, end_addr) of @vma by handing it, as a
 * start and a length, to zap_page_range_single().
 */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
		unsigned long start_addr, unsigned long end_addr,
		struct zap_details *details)
{
	zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}

/*
 * For every vma in the interval tree @root that overlaps the file page
 * offsets [first_index, last_index] (inclusive), unmap the overlapping part.
 */
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
					    pgoff_t first_index,
					    pgoff_t last_index,
					    struct zap_details *details)
{
	struct vm_area_struct *vma;
	pgoff_t vba, vea, zba, zea;

	vma_interval_tree_foreach(vma, root, first_index, last_index) {
		/* vba/vea: first and last file page offset mapped by the vma. */
		vba = vma->vm_pgoff;
		vea = vba + vma_pages(vma) - 1;
		/* zba/zea: the requested range clamped to this vma. */
		zba = max(first_index, vba);
		zea = min(last_index, vea);

		/* Convert the page offsets back to addresses inside the vma. */
		unmap_mapping_range_vma(vma,
			((zba - vba) << PAGE_SHIFT) + vma->vm_start,
			((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
				details);
	}
}

/**
 * unmap_mapping_folio() - Unmap single folio from processes.
 * @folio: The locked folio to be unmapped.
 *
 * Unmap this folio from any userspace process which still has it mmaped.
 * Typically, for efficiency, the range of nearby pages has already been
 * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
 * truncation or invalidation holds the lock on a folio, it may find that
 * the page has been remapped again: and then uses unmap_mapping_folio()
 * to unmap it finally.
+ */ +void unmap_mapping_folio(struct folio *folio) +{ + struct address_space *mapping = folio->mapping; + struct zap_details details = { }; + pgoff_t first_index; + pgoff_t last_index; + + VM_BUG_ON(!folio_test_locked(folio)); + + first_index = folio->index; + last_index = folio->index + folio_nr_pages(folio) - 1; + + details.even_cows = false; + details.single_folio = folio; + details.zap_flags = ZAP_FLAG_DROP_MARKER; + + i_mmap_lock_read(mapping); + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) + unmap_mapping_range_tree(&mapping->i_mmap, first_index, + last_index, &details); + i_mmap_unlock_read(mapping); +} + +/** + * unmap_mapping_pages() - Unmap pages from processes. + * @mapping: The address space containing pages to be unmapped. + * @start: Index of first page to be unmapped. + * @nr: Number of pages to be unmapped. 0 to unmap to end of file. + * @even_cows: Whether to unmap even private COWed pages. + * + * Unmap the pages in this address space from any userspace process which + * has them mmaped. Generally, you want to remove COWed pages as well when + * a file is being truncated, but not when invalidating pages from the page + * cache. + */ +void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, + pgoff_t nr, bool even_cows) +{ + struct zap_details details = { }; + pgoff_t first_index = start; + pgoff_t last_index = start + nr - 1; + + details.even_cows = even_cows; + if (last_index < first_index) + last_index = ULONG_MAX; + + i_mmap_lock_read(mapping); + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) + unmap_mapping_range_tree(&mapping->i_mmap, first_index, + last_index, &details); + i_mmap_unlock_read(mapping); +} +EXPORT_SYMBOL_GPL(unmap_mapping_pages); + +/** + * unmap_mapping_range - unmap the portion of all mmaps in the specified + * address_space corresponding to the specified byte range in the underlying + * file. + * + * @mapping: the address space containing mmaps to be unmapped. 
* @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file. This will be rounded down to a PAGE_SIZE
 * boundary. Note that this is different from truncate_pagecache(), which
 * must keep the partial page. In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes. This will be rounded
 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen, int even_cows)
{
	/* Convert the byte range to a first page index and a page count. */
	pgoff_t hba = (pgoff_t)(holebegin) >> PAGE_SHIFT;
	pgoff_t hlen = ((pgoff_t)(holelen) + PAGE_SIZE - 1) >> PAGE_SHIFT;

	/* Check for overflow. */
	if (sizeof(holelen) > sizeof(hlen)) {
		long long holeend =
			(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
		/* End index does not fit in an unsigned long: clamp the count. */
		if (holeend & ~(long long)ULONG_MAX)
			hlen = ULONG_MAX - hba + 1;
	}

	unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);

/*
 * Restore a potential device exclusive pte to a working pte entry
 *
 * Return: VM_FAULT_RETRY when folio_lock_or_retry() fails, 0 otherwise
 * (including when the folio reference cannot be taken or the pte no longer
 * matches vmf->orig_pte).
 */
static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
{
	struct folio *folio = page_folio(vmf->page);
	struct vm_area_struct *vma = vmf->vma;
	struct mmu_notifier_range range;

	/*
	 * We need a reference to lock the folio because we don't hold
	 * the PTL so a racing thread can remove the device-exclusive
	 * entry and unmap it. If the folio is free the entry must
	 * have been removed already. If it happens to have already
	 * been re-allocated after being freed all we do is lock and
	 * unlock it.
	 */
	if (!folio_try_get(folio))
		return 0;

	if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
		folio_put(folio);
		return VM_FAULT_RETRY;
	}
	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
				vma->vm_mm, vmf->address & PAGE_MASK,
				(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
	mmu_notifier_invalidate_range_start(&range);

	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
				&vmf->ptl);
	/* Only restore if the entry is still the one this fault was raised on. */
	if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
		restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	folio_unlock(folio);
	folio_put(folio);

	mmu_notifier_invalidate_range_end(&range);
	return 0;
}

/*
 * Decide whether the swap fault path should try to drop @folio from the
 * swapcache. Returns false if the folio is not in the swapcache at all.
 */
static inline bool should_try_to_free_swap(struct folio *folio,
					   struct vm_area_struct *vma,
					   unsigned int fault_flags)
{
	if (!folio_test_swapcache(folio))
		return false;
	if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
	    folio_test_mlocked(folio))
		return true;
	/*
	 * If we want to map a page that's in the swapcache writable, we
	 * have to detect via the refcount if we're really the exclusive
	 * user. Try freeing the swapcache to get rid of the swapcache
	 * reference only in case it's likely that we'll be the exclusive user.
	 */
	return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
		folio_ref_count(folio) == 2;
}

/*
 * Take the pte lock and clear the pte at vmf->address if it is (still) a
 * pte marker. Always returns 0.
 */
static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
{
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
				       vmf->address, &vmf->ptl);
	/*
	 * Be careful so that we will only recover a special uffd-wp pte into a
	 * none pte. Otherwise it means the pte could have changed, so retry.
	 */
	if (is_pte_marker(*vmf->pte))
		pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}

/*
 * This is actually a page-missing access, but with uffd-wp special pte
 * installed. It means this pte was wr-protected before being unmapped.
+ */ +static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf) +{ + /* + * Just in case there're leftover special ptes even after the region + * got unregistered - we can simply clear them. We can also do that + * proactively when e.g. when we do UFFDIO_UNREGISTER upon some uffd-wp + * ranges, but it should be more efficient to be done lazily here. + */ + if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma))) + return pte_marker_clear(vmf); + + /* do_fault() can handle pte markers too like none pte */ + return do_fault(vmf); +} + +static vm_fault_t handle_pte_marker(struct vm_fault *vmf) +{ + swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte); + unsigned long marker = pte_marker_get(entry); + + /* + * PTE markers should always be with file-backed memories, and the + * marker should never be empty. If anything weird happened, the best + * thing to do is to kill the process along with its mm. + */ + if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker)) + return VM_FAULT_SIGBUS; + + if (pte_marker_entry_uffd_wp(entry)) + return pte_marker_handle_uffd_wp(vmf); + + /* This is an unknown pte marker */ + return VM_FAULT_SIGBUS; +} + +/* + * We enter with non-exclusive mmap_lock (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with pte unmapped and unlocked. + * + * We return with the mmap_lock locked or unlocked in the same cases + * as does filemap_fault(). 
+ */ +vm_fault_t do_swap_page(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct folio *swapcache, *folio = NULL; + struct page *page; + struct swap_info_struct *si = NULL; + rmap_t rmap_flags = RMAP_NONE; + bool exclusive = false; + swp_entry_t entry; + pte_t pte; + int locked; + vm_fault_t ret = 0; + void *shadow = NULL; + + if (!pte_unmap_same(vmf)) + goto out; + + entry = pte_to_swp_entry(vmf->orig_pte); + if (unlikely(non_swap_entry(entry))) { + if (is_migration_entry(entry)) { + migration_entry_wait(vma->vm_mm, vmf->pmd, + vmf->address); + } else if (is_device_exclusive_entry(entry)) { + vmf->page = pfn_swap_entry_to_page(entry); + ret = remove_device_exclusive_entry(vmf); + } else if (is_device_private_entry(entry)) { + vmf->page = pfn_swap_entry_to_page(entry); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + spin_unlock(vmf->ptl); + goto out; + } + + /* + * Get a page reference while we know the page can't be + * freed. + */ + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); + put_page(vmf->page); + } else if (is_hwpoison_entry(entry)) { + ret = VM_FAULT_HWPOISON; + } else if (is_swapin_error_entry(entry)) { + ret = VM_FAULT_SIGBUS; + } else if (is_pte_marker_entry(entry)) { + ret = handle_pte_marker(vmf); + } else { + print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); + ret = VM_FAULT_SIGBUS; + } + goto out; + } + + /* Prevent swapoff from happening to us. 
*/ + si = get_swap_device(entry); + if (unlikely(!si)) + goto out; + + folio = swap_cache_get_folio(entry, vma, vmf->address); + if (folio) + page = folio_file_page(folio, swp_offset(entry)); + swapcache = folio; + + if (!folio) { + if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && + __swap_count(entry) == 1) { + /* skip swapcache */ + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, + vma, vmf->address, false); + page = &folio->page; + if (folio) { + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + + if (mem_cgroup_swapin_charge_folio(folio, + vma->vm_mm, GFP_KERNEL, + entry)) { + ret = VM_FAULT_OOM; + goto out_page; + } + mem_cgroup_swapin_uncharge_swap(entry); + + shadow = get_shadow_from_swap_cache(entry); + if (shadow) + workingset_refault(folio, shadow); + + folio_add_lru(folio); + + /* To provide entry to swap_readpage() */ + folio_set_swap_entry(folio, entry); + swap_readpage(page, true, NULL); + folio->private = NULL; + } + } else { + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + vmf); + if (page) + folio = page_folio(page); + swapcache = folio; + } + + if (!folio) { + /* + * Back out if somebody else faulted in this pte + * while we released the pte lock. 
+ */ + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) + ret = VM_FAULT_OOM; + goto unlock; + } + + /* Had to read the page from swap area: Major fault */ + ret = VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); + } else if (PageHWPoison(page)) { + /* + * hwpoisoned dirty swapcache pages are kept for killing + * owner processes (which may be unknown at hwpoison time) + */ + ret = VM_FAULT_HWPOISON; + goto out_release; + } + + locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags); + + if (!locked) { + ret |= VM_FAULT_RETRY; + goto out_release; + } + + if (swapcache) { + /* + * Make sure folio_free_swap() or swapoff did not release the + * swapcache from under us. The page pin, and pte_same test + * below, are not enough to exclude that. Even if it is still + * swapcache, we need to check that the page's swap has not + * changed. + */ + if (unlikely(!folio_test_swapcache(folio) || + page_private(page) != entry.val)) + goto out_page; + + /* + * KSM sometimes has to copy on read faults, for example, if + * page->index of !PageKSM() pages would be nonlinear inside the + * anon VMA -- PageKSM() is lost on actual swapout. + */ + page = ksm_might_need_to_copy(page, vma, vmf->address); + if (unlikely(!page)) { + ret = VM_FAULT_OOM; + goto out_page; + } + folio = page_folio(page); + + /* + * If we want to map a page that's in the swapcache writable, we + * have to detect via the refcount if we're really the exclusive + * owner. Try removing the extra reference from the local LRU + * pagevecs if required. + */ + if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && + !folio_test_ksm(folio) && !folio_test_lru(folio)) + lru_add_drain(); + } + + cgroup_throttle_swaprate(page, GFP_KERNEL); + + /* + * Back out if somebody else already faulted in this pte. 
+ */ + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) + goto out_nomap; + + if (unlikely(!folio_test_uptodate(folio))) { + ret = VM_FAULT_SIGBUS; + goto out_nomap; + } + + /* + * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte + * must never point at an anonymous page in the swapcache that is + * PG_anon_exclusive. Sanity check that this holds and especially, that + * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity + * check after taking the PT lock and making sure that nobody + * concurrently faulted in this page and set PG_anon_exclusive. + */ + BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio)); + BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page)); + + /* + * Check under PT lock (to protect against concurrent fork() sharing + * the swap entry concurrently) for certainly exclusive pages. + */ + if (!folio_test_ksm(folio)) { + /* + * Note that pte_swp_exclusive() == false for architectures + * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE. + */ + exclusive = pte_swp_exclusive(vmf->orig_pte); + if (folio != swapcache) { + /* + * We have a fresh page that is not exposed to the + * swapcache -> certainly exclusive. + */ + exclusive = true; + } else if (exclusive && folio_test_writeback(folio) && + data_race(si->flags & SWP_STABLE_WRITES)) { + /* + * This is tricky: not all swap backends support + * concurrent page modifications while under writeback. + * + * So if we stumble over such a page in the swapcache + * we must not set the page exclusive, otherwise we can + * map it writable without further checks and modify it + * while still under writeback. + * + * For these problematic swap backends, simply drop the + * exclusive marker: this is perfectly fine as we start + * writeback only if we fully unmapped the page and + * there are no unexpected references on the page after + * unmapping succeeded. 
After fully unmapped, no + * further GUP references (FOLL_GET and FOLL_PIN) can + * appear, so dropping the exclusive marker and mapping + * it only R/O is fine. + */ + exclusive = false; + } + } + + /* + * Some architectures may have to restore extra metadata to the page + * when reading from swap. This metadata may be indexed by swap entry + * so this must be called before swap_free(). + */ + arch_swap_restore(entry, folio); + + /* + * Remove the swap entry and conditionally try to free up the swapcache. + * We're already holding a reference on the page but haven't mapped it + * yet. + */ + swap_free(entry); + if (should_try_to_free_swap(folio, vma, vmf->flags)) + folio_free_swap(folio); + + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); + pte = mk_pte(page, vma->vm_page_prot); + + /* + * Same logic as in do_wp_page(); however, optimize for pages that are + * certainly not shared either because we just allocated them without + * exposing them to the swapcache or because the swap entry indicates + * exclusivity. 
+ */ + if (!folio_test_ksm(folio) && + (exclusive || folio_ref_count(folio) == 1)) { + if (vmf->flags & FAULT_FLAG_WRITE) { + pte = maybe_mkwrite(pte_mkdirty(pte), vma); + vmf->flags &= ~FAULT_FLAG_WRITE; + ret |= VM_FAULT_WRITE; + } + rmap_flags |= RMAP_EXCLUSIVE; + } + flush_icache_page(vma, page); + if (pte_swp_soft_dirty(vmf->orig_pte)) + pte = pte_mksoft_dirty(pte); + if (pte_swp_uffd_wp(vmf->orig_pte)) { + pte = pte_mkuffd_wp(pte); + pte = pte_wrprotect(pte); + } + vmf->orig_pte = pte; + + /* ksm created a completely new copy */ + if (unlikely(folio != swapcache && swapcache)) { + page_add_new_anon_rmap(page, vma, vmf->address); + folio_add_lru_vma(folio, vma); + } else { + page_add_anon_rmap(page, vma, vmf->address, rmap_flags); + } + + VM_BUG_ON(!folio_test_anon(folio) || + (pte_write(pte) && !PageAnonExclusive(page))); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); + + folio_unlock(folio); + if (folio != swapcache && swapcache) { + /* + * Hold the lock to avoid the swap entry to be reused + * until we take the PT lock for the pte_same() check + * (to avoid false positives from pte_same). For + * further safety release the lock after the swap_free + * so that the swap count won't change under a + * parallel locked swapcache. 
+		 */
+		folio_unlock(swapcache);
+		folio_put(swapcache);
+	}
+
+	if (vmf->flags & FAULT_FLAG_WRITE) {
+		ret |= do_wp_page(vmf);
+		if (ret & VM_FAULT_ERROR)
+			ret &= VM_FAULT_ERROR;
+		goto out;
+	}
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, vmf->address, vmf->pte);
+unlock:
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+out:
+	if (si)
+		put_swap_device(si);
+	return ret;
+out_nomap:
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+out_page:
+	folio_unlock(folio);
+out_release:
+	folio_put(folio);
+	if (folio != swapcache && swapcache) {
+		folio_unlock(swapcache);
+		folio_put(swapcache);
+	}
+	if (si)
+		put_swap_device(si);
+	return ret;
+}
+
+/*
+ * do_anonymous_page - install a PTE for a fault on an address of an
+ * anonymous VMA that has no page table entry yet.
+ *
+ * A fault without FAULT_FLAG_WRITE maps the zero page, unless
+ * mm_forbids_zeropage() says otherwise.  Every other fault allocates a
+ * zeroed page, charges it to the memcg and maps it, writable and dirty
+ * when the VMA has VM_WRITE.  In both cases the PTE is only written if it
+ * is still pte_none() once the PT lock is held.
+ *
+ * Return: 0 when the PTE was installed, was already populated by somebody
+ * else, or pmd_trans_unstable() asks for a retry; VM_FAULT_SIGBUS for a
+ * VM_SHARED vma; VM_FAULT_OOM when the page table, anon_vma, page or memcg
+ * charge cannot be obtained; a non-zero check_stable_address_space()
+ * result; or whatever handle_userfault() returns when userfaultfd_missing()
+ * is true for the vma.
+ *
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_lock still held, but pte unmapped and unlocked.
+ */
+static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct page *page;
+	vm_fault_t ret = 0;
+	pte_t entry;
+
+	/* File mapping without ->vm_ops ? */
+	if (vma->vm_flags & VM_SHARED)
+		return VM_FAULT_SIGBUS;
+
+	/*
+	 * Use pte_alloc() instead of pte_alloc_map().  We can't run
+	 * pte_offset_map() on pmds where a huge pmd might be created
+	 * from a different thread.
+	 *
+	 * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
+	 * parallel threads are excluded by other means.
+	 *
+	 * Here we only have mmap_read_lock(mm).
+	 */
+	if (pte_alloc(vma->vm_mm, vmf->pmd))
+		return VM_FAULT_OOM;
+
+	/* See comment in handle_pte_fault() */
+	if (unlikely(pmd_trans_unstable(vmf->pmd)))
+		return 0;
+
+	/* Use the zero-page for reads */
+	if (!(vmf->flags & FAULT_FLAG_WRITE) &&
+			!mm_forbids_zeropage(vma->vm_mm)) {
+		entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
+						vma->vm_page_prot));
+		vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+				vmf->address, &vmf->ptl);
+		if (!pte_none(*vmf->pte)) {
+			update_mmu_tlb(vma, vmf->address, vmf->pte);
+			goto unlock;
+		}
+		ret = check_stable_address_space(vma->vm_mm);
+		if (ret)
+			goto unlock;
+		/* Deliver the page fault to userland, check inside PT lock */
+		if (userfaultfd_missing(vma)) {
+			pte_unmap_unlock(vmf->pte, vmf->ptl);
+			return handle_userfault(vmf, VM_UFFD_MISSING);
+		}
+		goto setpte;	/* skips the counter, rmap and LRU updates below */
+	}
+
+	/* Allocate our own private page. */
+	if (unlikely(anon_vma_prepare(vma)))
+		goto oom;
+	page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
+	if (!page)
+		goto oom;
+
+	if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
+		goto oom_free_page;
+	cgroup_throttle_swaprate(page, GFP_KERNEL);
+
+	/*
+	 * The memory barrier inside __SetPageUptodate makes sure that
+	 * preceding stores to the page contents become visible before
+	 * the set_pte_at() write.
+	 */
+	__SetPageUptodate(page);
+
+	entry = mk_pte(page, vma->vm_page_prot);
+	entry = pte_sw_mkyoung(entry);
+	if (vma->vm_flags & VM_WRITE)
+		entry = pte_mkwrite(pte_mkdirty(entry));
+
+	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+			&vmf->ptl);
+	if (!pte_none(*vmf->pte)) {
+		update_mmu_tlb(vma, vmf->address, vmf->pte);
+		goto release;
+	}
+
+	ret = check_stable_address_space(vma->vm_mm);
+	if (ret)
+		goto release;
+
+	/* Deliver the page fault to userland, check inside PT lock */
+	if (userfaultfd_missing(vma)) {
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
+		put_page(page);
+		return handle_userfault(vmf, VM_UFFD_MISSING);
+	}
+
+	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+	page_add_new_anon_rmap(page, vma, vmf->address);
+	lru_cache_add_inactive_or_unevictable(page, vma);
+setpte:
+	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(vma, vmf->address, vmf->pte);
+unlock:
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	return ret;
+release:
+	put_page(page);	/* the freshly allocated page was not mapped */
+	goto unlock;
+oom_free_page:
+	put_page(page);
+oom:
+	return VM_FAULT_OOM;
+}
+
+/*
+ * __do_fault - call the vma's ->fault() handler and, on success, return
+ * with vmf->page locked.
+ *
+ * When the pmd is empty a page table is first preallocated into
+ * vmf->prealloc_pte.  VM_FAULT_ERROR, VM_FAULT_NOPAGE, VM_FAULT_RETRY and
+ * VM_FAULT_DONE_COW results of ->fault() are returned unchanged.  A
+ * hwpoisoned page is released and reported as VM_FAULT_HWPOISON, or as
+ * VM_FAULT_NOPAGE when invalidate_inode_page() succeeded on it.
+ *
+ * The mmap_lock must have been held on entry, and may have been
+ * released depending on flags and vma->vm_ops->fault() return value.
+ * See filemap_fault() and __lock_page_retry().
+ */
+static vm_fault_t __do_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	vm_fault_t ret;
+
+	/*
+	 * Preallocate pte before we take page_lock because this might lead to
+	 * deadlocks for memcg reclaim which waits for pages under writeback:
+	 *				lock_page(A)
+	 *				SetPageWriteback(A)
+	 *				unlock_page(A)
+	 * lock_page(B)
+	 *				lock_page(B)
+	 * pte_alloc_one
+	 *   shrink_page_list
+	 *     wait_on_page_writeback(A)
+	 *				SetPageWriteback(B)
+	 *				unlock_page(B)
+	 *				# flush A, B to clear the writeback
+	 */
+	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
+		if (!vmf->prealloc_pte)
+			return VM_FAULT_OOM;
+	}
+
+	ret = vma->vm_ops->fault(vmf);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
+			    VM_FAULT_DONE_COW)))
+		return ret;
+
+	if (unlikely(PageHWPoison(vmf->page))) {
+		struct page *page = vmf->page;
+		vm_fault_t poisonret = VM_FAULT_HWPOISON;
+		if (ret & VM_FAULT_LOCKED) {
+			if (page_mapped(page))
+				unmap_mapping_pages(page_mapping(page),
+						    page->index, 1, false);
+			/* Retry if a clean page was removed from the cache. */
+			if (invalidate_inode_page(page))
+				poisonret = VM_FAULT_NOPAGE;
+			unlock_page(page);
+		}
+		put_page(page);
+		vmf->page = NULL;
+		return poisonret;
+	}
+
+	/* Callers always get the page back locked, whether or not ->fault() locked it. */
+	if (unlikely(!(ret & VM_FAULT_LOCKED)))
+		lock_page(vmf->page);
+	else
+		VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
+
+	return ret;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void deposit_prealloc_pte(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+
+	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
+	/*
+	 * We are going to consume the prealloc table,
+	 * count that as nr_ptes.
+	 *
+	 * The table has been handed to pgtable_trans_huge_deposit() above;
+	 * clearing vmf->prealloc_pte keeps do_fault() from freeing it as
+	 * an unused preallocation.
+	 */
+	mm_inc_nr_ptes(vma->vm_mm);
+	vmf->prealloc_pte = NULL;
+}
+/*
+ * do_set_pmd - try to map the compound page containing @page with a single
+ * huge pmd entry at vmf->pmd.
+ *
+ * Return: 0 once the pmd entry is installed; VM_FAULT_FALLBACK when the vma
+ * is not suitable at the huge-page-aligned address, the compound page is
+ * not of HPAGE_PMD_ORDER, a subpage is hwpoisoned, or the pmd is no longer
+ * empty under the pmd lock; VM_FAULT_OOM when the page table needed for the
+ * deposit cannot be allocated.
+ */
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	pmd_t entry;
+	int i;
+	vm_fault_t ret = VM_FAULT_FALLBACK;
+
+	if (!transhuge_vma_suitable(vma, haddr))
+		return ret;
+
+	page = compound_head(page);
+	if (compound_order(page) != HPAGE_PMD_ORDER)
+		return ret;
+
+	/*
+	 * Just back off if any subpage of a THP is corrupted, otherwise
+	 * the corrupted page may be mapped by PMD silently to escape the
+	 * check.  This kind of THP can only be PTE mapped.  Access to
+	 * the corrupted subpage should trigger SIGBUS as expected.
+	 */
+	if (unlikely(PageHasHWPoisoned(page)))
+		return ret;
+
+	/*
+	 * Archs like ppc64 need additional space to store information
+	 * related to pte entry. Use the preallocated table for that.
+	 */
+	if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
+		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
+		if (!vmf->prealloc_pte)
+			return VM_FAULT_OOM;
+	}
+
+	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+	if (unlikely(!pmd_none(*vmf->pmd)))
+		goto out;	/* somebody populated the pmd first: ret is still FALLBACK */
+
+	for (i = 0; i < HPAGE_PMD_NR; i++)
+		flush_icache_page(vma, page + i);
+
+	entry = mk_huge_pmd(page, vma->vm_page_prot);
+	if (write)
+		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+
+	add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
+	page_add_file_rmap(page, vma, true);
+
+	/*
+	 * deposit and withdraw with pmd lock held
+	 */
+	if (arch_needs_pgtable_deposit())
+		deposit_prealloc_pte(vmf);
+
+	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+
+	update_mmu_cache_pmd(vma, haddr, vmf->pmd);
+
+	/* fault is handled */
+	ret = 0;
+	count_vm_event(THP_FILE_MAPPED);
+out:
+	spin_unlock(vmf->ptl);
+	return ret;
+}
+#else
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+{
+	return VM_FAULT_FALLBACK;	/* no CONFIG_TRANSPARENT_HUGEPAGE: always fall back */
+}
+#endif
+/*
+ * do_set_pte - build the PTE for @page and write it to vmf->pte for @addr.
+ *
+ * An @addr different from vmf->address is treated as a prefault: the PTE
+ * is made old when arch_wants_old_prefaulted_pte() asks for that, young
+ * otherwise.  A write fault on a vma without VM_SHARED is accounted as a
+ * new anonymous page (MM_ANONPAGES, anon rmap, LRU); every other case is
+ * accounted as a file page.  finish_fault() calls this with the PT lock
+ * held.
+ */
+void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool prefault = vmf->address != addr;
+	pte_t entry;
+
+	flush_icache_page(vma, page);
+	entry = mk_pte(page, vma->vm_page_prot);
+
+	if (prefault && arch_wants_old_prefaulted_pte())
+		entry = pte_mkold(entry);
+	else
+		entry = pte_sw_mkyoung(entry);
+
+	if (write)
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	if (unlikely(uffd_wp))
+		entry = pte_mkuffd_wp(pte_wrprotect(entry));
+	/* copy-on-write page */
+	if (write && !(vma->vm_flags & VM_SHARED)) {
+		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+		page_add_new_anon_rmap(page, vma, addr);
+		lru_cache_add_inactive_or_unevictable(page, vma);
+	} else {
+		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+		page_add_file_rmap(page, vma, false);
+	}
+	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
+}
+/*
+ * Return true if the PTE at vmf->pte is no longer the one this fault was
+ * started for: it differs from vmf->orig_pte when FAULT_FLAG_ORIG_PTE_VALID
+ * is set, or it is no longer pte_none() when no original PTE was recorded.
+ */
+static bool vmf_pte_changed(struct vm_fault *vmf)
+{
+	if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
+		return !pte_same(*vmf->pte, vmf->orig_pte);
+
+	return !pte_none(*vmf->pte);
+}
+
+/**
+ * finish_fault - finish page fault once we have prepared the page to fault
+ *
+ * @vmf: structure describing the fault
+ *
+ * This function handles all that is needed to finish a page fault once the
+ * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
+ * given page, adds reverse page mapping, handles memcg charges and LRU
+ * addition.
+ *
+ * The page that gets mapped is vmf->cow_page for a write fault on a vma
+ * without VM_SHARED, and vmf->page otherwise.
+ *
+ * The function expects the page to be locked and on success it consumes a
+ * reference of a page being mapped (for the PTE which maps it).
+ *
+ * Return: %0 on success, %VM_FAULT_ code in case of error.  %VM_FAULT_NOPAGE
+ * is returned when the pmd is unstable or the PTE changed before the PT lock
+ * was taken; the page is not mapped in that case.
+ */
+vm_fault_t finish_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct page *page;
+	vm_fault_t ret;
+
+	/* Did we COW the page? */
+	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+		page = vmf->cow_page;
+	else
+		page = vmf->page;
+
+	/*
+	 * check even for read faults because we might have lost our CoWed
+	 * page
+	 */
+	if (!(vma->vm_flags & VM_SHARED)) {
+		ret = check_stable_address_space(vma->vm_mm);
+		if (ret)
+			return ret;
+	}
+
+	if (pmd_none(*vmf->pmd)) {
+		if (PageTransCompound(page)) {
+			ret = do_set_pmd(vmf, page);
+			if (ret != VM_FAULT_FALLBACK)
+				return ret;
+		}
+
+		if (vmf->prealloc_pte)
+			pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
+		else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
+			return VM_FAULT_OOM;
+	}
+
+	/*
+	 * See comment in handle_pte_fault() for how this scenario happens, we
+	 * need to return NOPAGE so that we drop this page.
+	 */
+	if (pmd_devmap_trans_unstable(vmf->pmd))
+		return VM_FAULT_NOPAGE;
+
+	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+				      vmf->address, &vmf->ptl);
+
+	/* Re-check under ptl */
+	if (likely(!vmf_pte_changed(vmf))) {
+		do_set_pte(vmf, page, vmf->address);
+
+		/* no need to invalidate: a not-present page won't be cached */
+		update_mmu_cache(vma, vmf->address, vmf->pte);
+
+		ret = 0;
+	} else {
+		update_mmu_tlb(vma, vmf->address, vmf->pte);
+		ret = VM_FAULT_NOPAGE;
+	}
+
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	return ret;
+}
+/*
+ * Size in bytes of the window that do_fault_around() tries to map around a
+ * read fault.  With CONFIG_DEBUG_FS it can be read and changed through the
+ * "fault_around_bytes" debugfs file.
+ */
+static unsigned long fault_around_bytes __read_mostly =
+	rounddown_pow_of_two(65536);
+
+#ifdef CONFIG_DEBUG_FS
+static int fault_around_bytes_get(void *data, u64 *val)
+{
+	*val = fault_around_bytes;
+	return 0;
+}
+
+/*
+ * fault_around_bytes must be rounded down to the nearest page order as it's
+ * what do_fault_around() expects to see.
+ *
+ * Values spanning more than PTRS_PER_PTE pages are rejected with -EINVAL;
+ * values up to PAGE_SIZE select PAGE_SIZE.
+ */
+static int fault_around_bytes_set(void *data, u64 val)
+{
+	if (val / PAGE_SIZE > PTRS_PER_PTE)	/* more pages than one page table maps */
+		return -EINVAL;
+	if (val > PAGE_SIZE)
+		fault_around_bytes = rounddown_pow_of_two(val);
+	else
+		fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
+		fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
+/*
+ * Create the "fault_around_bytes" debugfs file (mode 0644) backed by
+ * fault_around_bytes_fops.  Runs as a late initcall and always returns 0.
+ */
+static int __init fault_around_debugfs(void)
+{
+	debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
+				   &fault_around_bytes_fops);
+	return 0;
+}
+late_initcall(fault_around_debugfs);
+#endif
+
+/*
+ * do_fault_around() tries to map a few pages around the fault address. The
+ * hope is that the pages will be needed soon and this will lower the number
+ * of faults to handle.
+ *
+ * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
+ * not ready to be mapped: not up-to-date, locked, etc.
+ *
+ * This function doesn't cross the VMA boundaries, in order to call map_pages()
+ * only once.
+ *
+ * fault_around_bytes defines how many bytes we'll try to map.
+ * do_fault_around() expects it to be set to a power of two that covers no
+ * more than PTRS_PER_PTE pages.
+ *
+ * The virtual address of the area that we map is naturally aligned to
+ * fault_around_bytes rounded down to the machine page size
+ * (and therefore to page order).  This way it's easier to guarantee
+ * that we don't cross page table boundaries.
+ *
+ * Return: VM_FAULT_OOM if the pmd is empty and no page table can be
+ * preallocated, otherwise the result of vm_ops->map_pages().
+ */
+static vm_fault_t do_fault_around(struct vm_fault *vmf)
+{
+	unsigned long address = vmf->address, nr_pages, mask;
+	pgoff_t start_pgoff = vmf->pgoff;
+	pgoff_t end_pgoff;
+	int off;
+
+	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
+	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
+	/*
+	 * Start the window at the aligned address, clamped to the start of
+	 * the vma.  off is the distance in pages from that start to the
+	 * faulting page, and start_pgoff is moved back by the same amount.
+	 */
+	address = max(address & mask, vmf->vma->vm_start);
+	off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+	start_pgoff -= off;
+
+	/*
+	 *  end_pgoff is either the end of the page table, the end of
+	 *  the vma or nr_pages from start_pgoff, depending what is nearest.
+	 */
+	end_pgoff = start_pgoff -
+		((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+		PTRS_PER_PTE - 1;
+	end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
+			start_pgoff + nr_pages - 1);
+
+	if (pmd_none(*vmf->pmd)) {
+		vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
+		if (!vmf->prealloc_pte)
+			return VM_FAULT_OOM;
+	}
+
+	return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
+}
+
+/* Return true if we should do read fault-around, false otherwise */
+static inline bool should_fault_around(struct vm_fault *vmf)
+{
+	/* No ->map_pages?  No way to fault around... */
+	if (!vmf->vma->vm_ops->map_pages)
+		return false;
+
+	if (uffd_disable_fault_around(vmf->vma))
+		return false;
+
+	return fault_around_bytes >> PAGE_SHIFT > 1;	/* window must span more than one page */
+}
+/*
+ * do_read_fault - handle a fault without FAULT_FLAG_WRITE on a vma that has
+ * a ->fault() handler (see do_fault()).
+ *
+ * Fault-around is tried first; any non-zero result from it is returned.
+ * Otherwise the page is obtained through __do_fault() and mapped with
+ * finish_fault().  The page is unlocked afterwards, and its reference is
+ * dropped when finish_fault() reported an error, NOPAGE or RETRY.
+ */
+static vm_fault_t do_read_fault(struct vm_fault *vmf)
+{
+	vm_fault_t ret = 0;
+
+	/*
+	 * Let's call ->map_pages() first and use ->fault() as fallback
+	 * if page by the offset is not ready to be mapped (cold cache or
+	 * something).
+	 */
+	if (should_fault_around(vmf)) {
+		ret = do_fault_around(vmf);
+		if (ret)
+			return ret;
+	}
+
+	ret = __do_fault(vmf);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+		return ret;
+
+	ret |= finish_fault(vmf);
+	unlock_page(vmf->page);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+		put_page(vmf->page);
+	return ret;
+}
+/*
+ * do_cow_fault - handle a write fault on a vma without VM_SHARED that has a
+ * ->fault() handler (see do_fault()).
+ *
+ * A new page is allocated into vmf->cow_page and charged to the memcg, the
+ * source page is obtained through __do_fault(), its contents are copied and
+ * the copy is mapped with finish_fault().  The source page is unlocked and
+ * released in any case; the copy is released when it did not get mapped.
+ * A VM_FAULT_DONE_COW result from __do_fault() is returned as is.
+ */
+static vm_fault_t do_cow_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	vm_fault_t ret;
+
+	if (unlikely(anon_vma_prepare(vma)))
+		return VM_FAULT_OOM;
+
+	vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+	if (!vmf->cow_page)
+		return VM_FAULT_OOM;
+
+	if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm,
+				GFP_KERNEL)) {
+		put_page(vmf->cow_page);
+		return VM_FAULT_OOM;
+	}
+	cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
+
+	ret = __do_fault(vmf);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+		goto uncharge_out;
+	if (ret & VM_FAULT_DONE_COW)
+		return ret;
+
+	copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
+	__SetPageUptodate(vmf->cow_page);
+
+	ret |= finish_fault(vmf);
+	unlock_page(vmf->page);
+	put_page(vmf->page);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+		goto uncharge_out;
+	return ret;
+uncharge_out:
+	put_page(vmf->cow_page);
+	return ret;
+}
+/*
+ * do_shared_fault - handle a write fault on a VM_SHARED vma that has a
+ * ->fault() handler (see do_fault()).
+ *
+ * The page comes from __do_fault().  If the vma has ->page_mkwrite, the
+ * page is unlocked and do_page_mkwrite() is called; a zero result or an
+ * error/NOPAGE result from it ends the fault after dropping the page
+ * reference.  Otherwise the page is mapped with finish_fault() and handed
+ * to fault_dirty_shared_page().
+ */
+static vm_fault_t do_shared_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	vm_fault_t ret, tmp;
+
+	ret = __do_fault(vmf);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+		return ret;
+
+	/*
+	 * Check if the backing address space wants to know that the page is
+	 * about to become writable
+	 */
+	if (vma->vm_ops->page_mkwrite) {
+		unlock_page(vmf->page);
+		tmp = do_page_mkwrite(vmf);
+		if (unlikely(!tmp ||
+				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+			put_page(vmf->page);
+			return tmp;
+		}
+	}
+
+	ret |= finish_fault(vmf);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
+					VM_FAULT_RETRY))) {
+		unlock_page(vmf->page);
+		put_page(vmf->page);
+		return ret;
+	}
+
+	ret |= fault_dirty_shared_page(vmf);
+	return ret;
+}
+
+/*
+ * do_fault - dispatch a fault on a not-yet-populated PTE of a vma that is
+ * not anonymous (see handle_pte_fault()).
+ *
+ * Without a ->fault() handler the result is VM_FAULT_SIGBUS, or
+ * VM_FAULT_NOPAGE when the PTE turns out to be populated once the PT lock
+ * is held.  Otherwise read faults go to do_read_fault(), write faults on a
+ * vma without VM_SHARED to do_cow_fault(), and the remaining write faults
+ * to do_shared_fault().  A page table that was preallocated but not used is
+ * freed before returning.
+ *
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
+ * but allow concurrent faults).
+ * The mmap_lock may have been released depending on flags and our
+ * return value.  See filemap_fault() and __folio_lock_or_retry().
+ * If mmap_lock is released, vma may become invalid (for example
+ * by other thread calling munmap()).
+ */
+static vm_fault_t do_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct mm_struct *vm_mm = vma->vm_mm;
+	vm_fault_t ret;
+
+	/*
+	 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
+	 */
+	if (!vma->vm_ops->fault) {
+		/*
+		 * If we find a migration pmd entry or a none pmd entry, which
+		 * should never happen, return SIGBUS
+		 */
+		if (unlikely(!pmd_present(*vmf->pmd)))
+			ret = VM_FAULT_SIGBUS;
+		else {
+			vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
+						       vmf->pmd,
+						       vmf->address,
+						       &vmf->ptl);
+			/*
+			 * Make sure this is not a temporary clearing of pte
+			 * by holding ptl and checking again. A R/M/W update
+			 * of pte involves: take ptl, clearing the pte so that
+			 * we don't have concurrent modification by hardware
+			 * followed by an update.
+			 */
+			if (unlikely(pte_none(*vmf->pte)))
+				ret = VM_FAULT_SIGBUS;
+			else
+				ret = VM_FAULT_NOPAGE;
+
+			pte_unmap_unlock(vmf->pte, vmf->ptl);
+		}
+	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
+		ret = do_read_fault(vmf);
+	else if (!(vma->vm_flags & VM_SHARED))
+		ret = do_cow_fault(vmf);
+	else
+		ret = do_shared_fault(vmf);
+
+	/* preallocated pagetable is unused: free it */
+	if (vmf->prealloc_pte) {
+		pte_free(vm_mm, vmf->prealloc_pte);
+		vmf->prealloc_pte = NULL;
+	}
+	return ret;
+}
+/*
+ * numa_migrate_prep - account a NUMA hinting fault on @page and ask the
+ * memory policy where the page should live.
+ *
+ * Takes a reference on @page, counts NUMA_HINT_FAULTS and, when @page_nid
+ * is the current node, NUMA_HINT_FAULTS_LOCAL (also setting TNF_FAULT_LOCAL
+ * in *@flags).  Returns the result of mpol_misplaced(); do_numa_page()
+ * drops the reference again when that result is NUMA_NO_NODE.
+ */
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+		      unsigned long addr, int page_nid, int *flags)
+{
+	get_page(page);
+
+	count_vm_numa_event(NUMA_HINT_FAULTS);
+	if (page_nid == numa_node_id()) {
+		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+		*flags |= TNF_FAULT_LOCAL;
+	}
+
+	return mpol_misplaced(page, vma, addr);
+}
+/*
+ * do_numa_page - handle a fault on a pte_protnone() PTE of an accessible
+ * vma (see handle_pte_fault()).
+ *
+ * Either the page is migrated to the node chosen by numa_migrate_prep(), or
+ * the PTE is rebuilt from vma->vm_page_prot (out_map).  The fault is
+ * reported to task_numa_fault() whenever the page's node was determined.
+ * Always returns 0.
+ */
+static vm_fault_t do_numa_page(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct page *page = NULL;
+	int page_nid = NUMA_NO_NODE;
+	int last_cpupid;
+	int target_nid;
+	pte_t pte, old_pte;
+	bool was_writable = pte_savedwrite(vmf->orig_pte);
+	int flags = 0;
+
+	/*
+	 * The "pte" at this point cannot be used safely without
+	 * validation through pte_unmap_same(). It's of NUMA type but
+	 * the pfn may be screwed if the read is non atomic.
+	 */
+	vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
+	spin_lock(vmf->ptl);
+	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
+		goto out;
+	}
+
+	/* Get the normal PTE  */
+	old_pte = ptep_get(vmf->pte);
+	pte = pte_modify(old_pte, vma->vm_page_prot);
+
+	page = vm_normal_page(vma, vmf->address, pte);
+	if (!page || is_zone_device_page(page))
+		goto out_map;
+
+	/* TODO: handle PTE-mapped THP */
+	if (PageCompound(page))
+		goto out_map;
+
+	/*
+	 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
+	 * much anyway since they can be in shared cache state. This misses
+	 * the case where a mapping is writable but the process never writes
+	 * to it but pte_write gets cleared during protection updates and
+	 * pte_dirty has unpredictable behaviour between PTE scan updates,
+	 * background writeback, dirty balancing and application behaviour.
+	 */
+	if (!was_writable)
+		flags |= TNF_NO_GROUP;
+
+	/*
+	 * Flag if the page is shared between multiple address spaces. This
+	 * is later used when determining whether to group tasks together
+	 */
+	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+		flags |= TNF_SHARED;
+
+	page_nid = page_to_nid(page);
+	/*
+	 * For memory tiering mode, cpupid of slow memory page is used
+	 * to record page access time.  So use default value.
+	 */
+	if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+	    !node_is_toptier(page_nid))
+		last_cpupid = (-1 & LAST_CPUPID_MASK);
+	else
+		last_cpupid = page_cpupid_last(page);
+	target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
+			&flags);
+	if (target_nid == NUMA_NO_NODE) {
+		put_page(page);	/* drop the reference taken by numa_migrate_prep() */
+		goto out_map;
+	}
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+
+	/* Migrate to the requested node */
+	if (migrate_misplaced_page(page, vma, target_nid)) {
+		page_nid = target_nid;
+		flags |= TNF_MIGRATED;
+	} else {
+		flags |= TNF_MIGRATE_FAIL;
+		/* Retake the PT lock and restore the PTE unless it changed meanwhile. */
+		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+		spin_lock(vmf->ptl);
+		if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+			pte_unmap_unlock(vmf->pte, vmf->ptl);
+			goto out;
+		}
+		goto out_map;
+	}
+
+out:
+	if (page_nid != NUMA_NO_NODE)
+		task_numa_fault(last_cpupid, page_nid, 1, flags);
+	return 0;
+out_map:
+	/*
+	 * Make it present again, depending on how arch implements
+	 * non-accessible ptes, some can allow access by kernel mode.
+	 */
+	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+	pte = pte_modify(old_pte, vma->vm_page_prot);
+	pte = pte_mkyoung(pte);
+	if (was_writable)
+		pte = pte_mkwrite(pte);
+	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
+	update_mmu_cache(vma, vmf->address, vmf->pte);
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	goto out;
+}
+/*
+ * Try to handle a fault on an empty pmd with a huge page: anonymous vmas go
+ * to do_huge_pmd_anonymous_page(), vmas with a ->huge_fault handler to that
+ * handler with PE_SIZE_PMD, everything else gets VM_FAULT_FALLBACK.
+ */
+static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
+{
+	if (vma_is_anonymous(vmf->vma))
+		return do_huge_pmd_anonymous_page(vmf);
+	if (vmf->vma->vm_ops->huge_fault)
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+	return VM_FAULT_FALLBACK;
+}
+
+/*
+ * Handle a write or unshare fault on a huge pmd that is not writable.
+ * Anonymous vmas are handled by handle_userfault() (uffd-wp pmd, unless
+ * this is an unshare fault) or do_huge_pmd_wp_page().  For other vmas the
+ * ->huge_fault handler is tried; if there is none or it falls back, the
+ * pmd is split and VM_FAULT_FALLBACK is returned so the fault continues at
+ * the pte level.
+ *
+ * `inline' is required to avoid gcc 4.1.2 build error
+ */
+static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
+{
+	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
+
+	if (vma_is_anonymous(vmf->vma)) {
+		if (likely(!unshare) &&
+		    userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
+			return handle_userfault(vmf, VM_UFFD_WP);
+		return do_huge_pmd_wp_page(vmf);
+	}
+	if (vmf->vma->vm_ops->huge_fault) {
+		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
+	}
+
+	/* COW or write-notify handled on pte level: split pmd. */
+	__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
+
+	return VM_FAULT_FALLBACK;
+}
+/*
+ * PUD-sized counterpart of create_huge_pmd().  Only vmas that are not
+ * anonymous and have a ->huge_fault handler are served, and only when both
+ * CONFIG_TRANSPARENT_HUGEPAGE and CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+ * are set; otherwise VM_FAULT_FALLBACK is returned.
+ */
+static vm_fault_t create_huge_pud(struct vm_fault *vmf)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+	/* No support for anonymous transparent PUD pages yet */
+	if (vma_is_anonymous(vmf->vma))
+		return VM_FAULT_FALLBACK;
+	if (vmf->vma->vm_ops->huge_fault)
+		return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+	return VM_FAULT_FALLBACK;
+}
+/*
+ * Handle a write fault on a huge pud that is not writable.  The
+ * ->huge_fault handler of a vma that is not anonymous is tried first; if
+ * it is absent or falls back, or the vma is anonymous, the pud is split
+ * and VM_FAULT_FALLBACK is returned.  @orig_pud is not used in this body.
+ */
+static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+	/* No support for anonymous transparent PUD pages yet */
+	if (vma_is_anonymous(vmf->vma))
+		goto split;
+	if (vmf->vma->vm_ops->huge_fault) {
+		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
+	}
+split:
+	/* COW or write-notify not handled on PUD level: split pud.*/
+	__split_huge_pud(vmf->vma, vmf->pud, vmf->address);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+	return VM_FAULT_FALLBACK;
+}
+
+/*
+ * handle_pte_fault - resolve a fault at the pte level.
+ *
+ * A missing PTE goes to do_anonymous_page() or do_fault(), a non-present
+ * one to do_swap_page(), a protnone one in an accessible vma to
+ * do_numa_page(), and a write or unshare fault on a non-writable PTE to
+ * do_wp_page().  Whatever remains only needs its accessed/dirty bits
+ * updated.
+ *
+ * These routines also need to handle stuff like marking pages dirty
+ * and/or accessed for architectures that don't do it in hardware (most
+ * RISC architectures).  The early dirtying is also good on the i386.
+ *
+ * There is also a hook called "update_mmu_cache()" that architectures
+ * with external mmu caches can use to update those (ie the Sparc or
+ * PowerPC hashed page tables that act as extended TLBs).
+ *
+ * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
+ * concurrent faults).
+ *
+ * The mmap_lock may have been released depending on flags and our return value.
+ * See filemap_fault() and __folio_lock_or_retry().
+ */
+static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
+{
+	pte_t entry;
+
+	if (unlikely(pmd_none(*vmf->pmd))) {
+		/*
+		 * Leave __pte_alloc() until later: because vm_ops->fault may
+		 * want to allocate huge page, and if we expose page table
+		 * for an instant, it will be difficult to retract from
+		 * concurrent faults and from rmap lookups.
+		 */
+		vmf->pte = NULL;
+		vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
+	} else {
+		/*
+		 * If a huge pmd materialized under us just retry later.  Use
+		 * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
+		 * of pmd_trans_huge() to ensure the pmd didn't become
+		 * pmd_trans_huge under us and then back to pmd_none, as a
+		 * result of MADV_DONTNEED running immediately after a huge pmd
+		 * fault in a different thread of this mm, in turn leading to a
+		 * misleading pmd_trans_huge() retval. All we have to ensure is
+		 * that it is a regular pmd that we can walk with
+		 * pte_offset_map() and we can do that through an atomic read
+		 * in C, which is what pmd_trans_unstable() provides.
+		 */
+		if (pmd_devmap_trans_unstable(vmf->pmd))
+			return 0;
+		/*
+		 * A regular pmd is established and it can't morph into a huge
+		 * pmd from under us anymore at this point because we hold the
+		 * mmap_lock read mode and khugepaged takes it in write mode.
+		 * So now it's safe to run pte_offset_map().
+		 */
+		vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+		vmf->orig_pte = *vmf->pte;
+		vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
+
+		/*
+		 * some architectures can have larger ptes than wordsize,
+		 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
+		 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
+		 * accesses.  The code below just needs a consistent view
+		 * for the ifs and we later double check anyway with the
+		 * ptl lock held. So here a barrier will do.
+		 */
+		barrier();
+		if (pte_none(vmf->orig_pte)) {
+			pte_unmap(vmf->pte);
+			vmf->pte = NULL;
+		}
+	}
+	/* No PTE (empty pmd or pte_none): populate it according to the vma type. */
+	if (!vmf->pte) {
+		if (vma_is_anonymous(vmf->vma))
+			return do_anonymous_page(vmf);
+		else
+			return do_fault(vmf);
+	}
+
+	if (!pte_present(vmf->orig_pte))
+		return do_swap_page(vmf);
+
+	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+		return do_numa_page(vmf);
+	/*
+	 * Present PTE: take the PT lock, make sure the PTE is still the one
+	 * sampled above, then either hand over to do_wp_page() or update the
+	 * young/dirty bits in place.
+	 */
+	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+	spin_lock(vmf->ptl);
+	entry = vmf->orig_pte;
+	if (unlikely(!pte_same(*vmf->pte, entry))) {
+		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
+		goto unlock;
+	}
+	if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
+		if (!pte_write(entry))
+			return do_wp_page(vmf);	/* called with the PT lock still held */
+		else if (likely(vmf->flags & FAULT_FLAG_WRITE))
+			entry = pte_mkdirty(entry);
+	}
+	entry = pte_mkyoung(entry);
+	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
+				vmf->flags & FAULT_FLAG_WRITE)) {
+		update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
+	} else {
+		/* Skip spurious TLB flush for retried page fault */
+		if (vmf->flags & FAULT_FLAG_TRIED)
+			goto unlock;
+		/*
+		 * This is needed only for protection faults but the arch code
+		 * is not yet telling us if this is a protection fault or not.
+		 * This still avoids useless tlb flushes for .text page faults
+		 * with threads.
+		 */
+		if (vmf->flags & FAULT_FLAG_WRITE)
+			flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
+	}
+unlock:
+	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	return 0;
+}
+
+/*
+ * __handle_mm_fault - walk the page tables for @address in @vma and handle
+ * the fault at the first level that can resolve it.
+ *
+ * The p4d, pud and pmd levels are allocated on the way down (VM_FAULT_OOM
+ * when that fails).  Huge pud and huge pmd cases are handled here; anything
+ * else, including huge-page fallbacks, ends in handle_pte_fault().
+ *
+ * By the time we get here, we already hold the mm semaphore
+ *
+ * The mmap_lock may have been released depending on flags and our
+ * return value.  See filemap_fault() and __folio_lock_or_retry().
+ */
+static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
+		unsigned long address, unsigned int flags)
+{
+	struct vm_fault vmf = {
+		.vma = vma,
+		.address = address & PAGE_MASK,
+		.real_address = address,
+		.flags = flags,
+		.pgoff = linear_page_index(vma, address),
+		.gfp_mask = __get_fault_gfp_mask(vma),
+	};
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long vm_flags = vma->vm_flags;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	vm_fault_t ret;
+	/* Walk down from the pgd, allocating the p4d and pud levels as needed. */
+	pgd = pgd_offset(mm, address);
+	p4d = p4d_alloc(mm, pgd, address);
+	if (!p4d)
+		return VM_FAULT_OOM;
+
+	vmf.pud = pud_alloc(mm, p4d, address);
+	if (!vmf.pud)
+		return VM_FAULT_OOM;
+retry_pud:
+	if (pud_none(*vmf.pud) &&
+	    hugepage_vma_check(vma, vm_flags, false, true, true)) {
+		ret = create_huge_pud(&vmf);
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
+	} else {
+		pud_t orig_pud = *vmf.pud;
+
+		barrier();
+		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+
+			/*
+			 * TODO once we support anonymous PUDs: NUMA case and
+			 * FAULT_FLAG_UNSHARE handling.
+			 */
+			if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
+				ret = wp_huge_pud(&vmf, orig_pud);
+				if (!(ret & VM_FAULT_FALLBACK))
+					return ret;
+			} else {
+				huge_pud_set_accessed(&vmf, orig_pud);
+				return 0;
+			}
+		}
+	}
+
+	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
+	if (!vmf.pmd)
+		return VM_FAULT_OOM;
+
+	/* Huge pud page fault raced with pmd_alloc? */
+	if (pud_trans_unstable(vmf.pud))
+		goto retry_pud;
+
+	if (pmd_none(*vmf.pmd) &&
+	    hugepage_vma_check(vma, vm_flags, false, true, true)) {
+		ret = create_huge_pmd(&vmf);
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
+	} else {
+		vmf.orig_pmd = *vmf.pmd;
+
+		barrier();
+		if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
+			VM_BUG_ON(thp_migration_supported() &&
+					  !is_pmd_migration_entry(vmf.orig_pmd));
+			if (is_pmd_migration_entry(vmf.orig_pmd))
+				pmd_migration_entry_wait(mm, vmf.pmd);
+			return 0;
+		}
+		if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+			if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+				return do_huge_pmd_numa_page(&vmf);
+
+			if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+			    !pmd_write(vmf.orig_pmd)) {
+				ret = wp_huge_pmd(&vmf);
+				if (!(ret & VM_FAULT_FALLBACK))
+					return ret;
+			} else {
+				huge_pmd_set_accessed(&vmf);
+				return 0;
+			}
+		}
+	}
+
+	return handle_pte_fault(&vmf);
+}
+
+/**
+ * mm_account_fault - Do page fault accounting
+ *
+ * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
+ *        of perf event counters, but we'll still do the per-task accounting to
+ *        the task who triggered this page fault.
+ * @address: the faulted address.
+ * @flags: the fault flags.
+ * @ret: the fault retcode.
+ *
+ * This will take care of most of the page fault accounting.  Meanwhile, it
+ * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
+ * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * still be in per-arch page fault handlers at the entry of page fault.
+ */
+static inline void mm_account_fault(struct pt_regs *regs,
+				    unsigned long address, unsigned int flags,
+				    vm_fault_t ret)
+{
+	bool major;
+
+	/*
+	 * We don't do accounting for some specific faults:
+	 *
+	 * - Unsuccessful faults (e.g. when the address wasn't valid).  That
+	 *   includes arch_vma_access_permitted() failing before reaching here.
+	 *   So this is not a "this many hardware page faults" counter.  We
+	 *   should use the hw profiling for that.
+	 *
+	 * - Incomplete faults (VM_FAULT_RETRY).  They will only be counted
+	 *   once they're completed.
+	 */
+	if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
+		return;
+
+	/*
+	 * We define the fault as a major fault when the final successful fault
+	 * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
+	 * handle it immediately previously).
+	 */
+	major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
+
+	if (major)
+		current->maj_flt++;
+	else
+		current->min_flt++;
+
+	/*
+	 * If the fault is done for GUP, regs will be NULL.  We only do the
+	 * accounting for the per thread fault counters who triggered the
+	 * fault, and we skip the perf event updates.
+	 */
+	if (!regs)
+		return;
+
+	if (major)
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+	else
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+}
+
+#ifdef CONFIG_LRU_GEN
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+	/* the LRU algorithm doesn't apply to sequential or random reads */
+	current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+}
+/* Clear the flag set by lru_gen_enter_fault() once the fault is handled. */
+static void lru_gen_exit_fault(void)
+{
+	current->in_lru_fault = false;
+}
+#else
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+}
+
+static void lru_gen_exit_fault(void)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
+/*
+ * handle_mm_fault - entry point for handling a page fault on @vma.
+ *
+ * @vma: the vma covering the faulting address.
+ * @address: the faulting address.
+ * @flags: FAULT_FLAG_* bits describing the access.
+ * @regs: registers at fault time, passed on to mm_account_fault(); may be
+ *        NULL, in which case the perf event updates are skipped.
+ *
+ * Counts the fault, refuses it with VM_FAULT_SIGSEGV when
+ * arch_vma_access_permitted() says no, and otherwise returns the result of
+ * hugetlb_fault() for hugetlb vmas or __handle_mm_fault() for all others.
+ * For FAULT_FLAG_USER faults the memcg user-fault state is entered around
+ * the handler.
+ *
+ * By the time we get here, we already hold the mm semaphore
+ *
+ * The mmap_lock may have been released depending on flags and our
+ * return value.  See filemap_fault() and __folio_lock_or_retry().
+ */
+vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
+			   unsigned int flags, struct pt_regs *regs)
+{
+	vm_fault_t ret;
+
+	__set_current_state(TASK_RUNNING);
+
+	count_vm_event(PGFAULT);
+	count_memcg_event_mm(vma->vm_mm, PGFAULT);
+
+	/* do counter updates before entering really critical section. */
+	check_sync_rss_stat(current);
+
+	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+					    flags & FAULT_FLAG_INSTRUCTION,
+					    flags & FAULT_FLAG_REMOTE))
+		return VM_FAULT_SIGSEGV;
+
+	/*
+	 * Enable the memcg OOM handling for faults triggered in user
+	 * space.  Kernel faults are handled more gracefully.
+	 */
+	if (flags & FAULT_FLAG_USER)
+		mem_cgroup_enter_user_fault();
+
+	lru_gen_enter_fault(vma);
+
+	if (unlikely(is_vm_hugetlb_page(vma)))
+		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
+	else
+		ret = __handle_mm_fault(vma, address, flags);
+
+	lru_gen_exit_fault();
+
+	if (flags & FAULT_FLAG_USER) {
+		mem_cgroup_exit_user_fault();
+		/*
+		 * The task may have entered a memcg OOM situation but
+		 * if the allocation error was handled gracefully (no
+		 * VM_FAULT_OOM), there is no need to kill anything.
+		 * Just clean up the OOM state peacefully.
+		 */
+		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+			mem_cgroup_oom_synchronize(false);
+	}
+
+	mm_account_fault(regs, address, flags, ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(handle_mm_fault);
+
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include	/* NOTE(review): header name is missing here, apparently lost in extraction; presumably linux/extable.h for search_exception_tables() - confirm against upstream */
+/*
+ * Take mmap_lock for reading and return true once it is held.
+ *
+ * A trylock is attempted first.  If it fails and the fault came from kernel
+ * mode (@regs set and not user_mode()) at an instruction that has no
+ * exception table entry, false is returned without blocking.  Otherwise the
+ * lock is taken with mmap_read_lock_killable(), whose failure also yields
+ * false.
+ */
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+	/* Even if this succeeds, make it clear we *might* have slept */
+	if (likely(mmap_read_trylock(mm))) {
+		might_sleep();
+		return true;
+	}
+
+	if (regs && !user_mode(regs)) {
+		unsigned long ip = instruction_pointer(regs);
+		if (!search_exception_tables(ip))
+			return false;
+	}
+
+	return !mmap_read_lock_killable(mm);
+}
+
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+{
+	/*
+	 * We don't have this operation yet.
+	 *
+	 * It should be easy enough to do: it's basically a
+	 *    atomic_long_try_cmpxchg_acquire()
+	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+	 * it also needs the proper lockdep magic etc.
+ */ + return false; +} + +/* + * Drop the mmap read lock and re-take the lock for writing (killable). + * For a kernel-mode fault (@regs set and !user_mode()), the write lock is + * only attempted if the faulting instruction has an exception table entry. + * + * Returns true with the write lock held, false with no lock held. + */ +static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs) +{ + mmap_read_unlock(mm); + if (regs && !user_mode(regs)) { + unsigned long ip = instruction_pointer(regs); + if (!search_exception_tables(ip)) + return false; + } + return !mmap_write_lock_killable(mm); +} + +/* + * Helper for page fault handling. + * + * This is kind of equivalent to "mmap_read_lock()" followed + * by "find_extend_vma()", except it's a lot more careful about + * the locking (and will drop the lock on failure). + * + * For example, if we have a kernel bug that causes a page + * fault, we don't want to just use mmap_read_lock() to get + * the mm lock, because that would deadlock if the bug were + * to happen while we're holding the mm lock for writing. + * + * So this checks the exception tables on kernel faults in + * order to only do this all for instructions that are actually + * expected to fault. + * + * We can also actually take the mm lock for writing if we + * need to extend the vma, which helps the VM layer a lot. + */ +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, struct pt_regs *regs) +{ + struct vm_area_struct *vma; + + if (!get_mmap_lock_carefully(mm, regs)) + return NULL; + + vma = find_vma(mm, addr); + if (likely(vma && (vma->vm_start <= addr))) + return vma; + + /* + * Well, dang. We might still be successful, but only + * if we can extend a vma to do so. + */ + if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { + mmap_read_unlock(mm); + return NULL; + } + + /* + * We can try to upgrade the mmap lock atomically, + * in which case we can continue to use the vma + * we already looked up. + * + * Otherwise we'll have to drop the mmap lock and + * re-take it, and also look up the vma again, + * re-checking it.
+ */ + if (!mmap_upgrade_trylock(mm)) { + if (!upgrade_mmap_lock_carefully(mm, regs)) + return NULL; + + vma = find_vma(mm, addr); + if (!vma) + goto fail; + if (vma->vm_start <= addr) + goto success; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto fail; + } + + if (expand_stack_locked(vma, addr)) + goto fail; + +success: + mmap_write_downgrade(mm); + return vma; + +fail: + mmap_write_unlock(mm); + return NULL; +} +#endif + +#ifndef __PAGETABLE_P4D_FOLDED +/* + * Allocate p4d page table. + * We've already handled the fast-path in-line. + */ +int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + p4d_t *new = p4d_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); + if (pgd_present(*pgd)) { /* Another has populated it */ + p4d_free(mm, new); + } else { + smp_wmb(); /* See comment in pmd_install() */ + pgd_populate(mm, pgd, new); + } + spin_unlock(&mm->page_table_lock); + return 0; +} +#endif /* __PAGETABLE_P4D_FOLDED */ + +#ifndef __PAGETABLE_PUD_FOLDED +/* + * Allocate page upper directory. + * We've already handled the fast-path in-line. + */ +int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) +{ + pud_t *new = pud_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); + if (!p4d_present(*p4d)) { + mm_inc_nr_puds(mm); + smp_wmb(); /* See comment in pmd_install() */ + p4d_populate(mm, p4d, new); + } else /* Another has populated it */ + pud_free(mm, new); + spin_unlock(&mm->page_table_lock); + return 0; +} +#endif /* __PAGETABLE_PUD_FOLDED */ + +#ifndef __PAGETABLE_PMD_FOLDED +/* + * Allocate page middle directory. + * We've already handled the fast-path in-line. 
+ */ +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + spinlock_t *ptl; + pmd_t *new = pmd_alloc_one(mm, address); + if (!new) + return -ENOMEM; + + ptl = pud_lock(mm, pud); + if (!pud_present(*pud)) { + mm_inc_nr_pmds(mm); + smp_wmb(); /* See comment in pmd_install() */ + pud_populate(mm, pud, new); + } else { /* Another has populated it */ + pmd_free(mm, new); + } + spin_unlock(ptl); + return 0; +} +#endif /* __PAGETABLE_PMD_FOLDED */ + +/** + * follow_pte - look up PTE at a user virtual address + * @mm: the mm_struct of the target address space + * @address: user virtual address + * @ptepp: location to store found PTE + * @ptlp: location to store the lock for the PTE + * + * On a successful return, the pointer to the PTE is stored in @ptepp; + * the corresponding lock is taken and its location is stored in @ptlp. + * The contents of the PTE are only stable until @ptlp is released; + * any further use, if any, must be protected against invalidation + * with MMU notifiers. + * + * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore + * should be taken for read. + * + * KVM uses this function. While it is arguably less bad than ``follow_pfn``, + * it is not a good general-purpose API. + * + * Return: zero on success, -ve otherwise. 
+ */ +int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out; + + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) + goto out; + + pud = pud_offset(p4d, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out; + + pmd = pmd_offset(pud, address); + VM_BUG_ON(pmd_trans_huge(*pmd)); + + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, address, ptlp); + if (!pte_present(*ptep)) + goto unlock; + *ptepp = ptep; + return 0; +unlock: + pte_unmap_unlock(ptep, *ptlp); +out: + return -EINVAL; +} +EXPORT_SYMBOL_GPL(follow_pte); + +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * This function does not allow the caller to read the permissions + * of the PTE. Do not use it. + * + * Return: zero and the pfn at @pfn on success, -ve otherwise. 
+ */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + int ret = -EINVAL; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return ret; + + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + if (ret) + return ret; + *pfn = pte_pfn(*ptep); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(follow_pfn); + +#ifdef CONFIG_HAVE_IOREMAP_PROT +int follow_phys(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned long *prot, resource_size_t *phys) +{ + int ret = -EINVAL; + pte_t *ptep, pte; + spinlock_t *ptl; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; + + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + goto out; + pte = *ptep; + + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + + *prot = pgprot_val(pte_pgprot(pte)); + *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + + ret = 0; +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return ret; +} + +/** + * generic_access_phys - generic implementation for iomem mmap access + * @vma: the vma to access + * @addr: userspace address, not relative offset within @vma + * @buf: buffer to read/write + * @len: length of transfer + * @write: set to FOLL_WRITE when writing, otherwise reading + * + * This is a generic implementation for &vm_operations_struct.access for an + * iomem mapping. This callback is used by access_process_vm() when the @vma is + * not page based. 
+ */ +int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + resource_size_t phys_addr; + unsigned long prot = 0; + void __iomem *maddr; + pte_t *ptep, pte; + spinlock_t *ptl; + int offset = offset_in_page(addr); + int ret = -EINVAL; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + +retry: + if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + return -EINVAL; + pte = *ptep; + pte_unmap_unlock(ptep, ptl); + + prot = pgprot_val(pte_pgprot(pte)); + phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; + + if ((write & FOLL_WRITE) && !pte_write(pte)) + return -EINVAL; + + maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); + if (!maddr) + return -ENOMEM; + + if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + goto out_unmap; + + if (!pte_same(pte, *ptep)) { + pte_unmap_unlock(ptep, ptl); + iounmap(maddr); + + goto retry; + } + + if (write) + memcpy_toio(maddr + offset, buf, len); + else + memcpy_fromio(buf, maddr + offset, len); + ret = len; + pte_unmap_unlock(ptep, ptl); +out_unmap: + iounmap(maddr); + + return ret; +} +EXPORT_SYMBOL_GPL(generic_access_phys); +#endif + +/* + * Access another process' address space as given in mm. 
+ */ +int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, + int len, unsigned int gup_flags) +{ + struct vm_area_struct *vma; + void *old_buf = buf; + int write = gup_flags & FOLL_WRITE; + + if (mmap_read_lock_killable(mm)) + return 0; + + /* We might need to expand the stack to access it */ + vma = vma_lookup(mm, addr); + if (!vma) { + vma = expand_stack(mm, addr); + if (!vma) + return 0; + } + + /* ignore errors, just check how much was successfully transferred */ + while (len) { + int bytes, ret, offset; + void *maddr; + struct page *page = NULL; + + ret = get_user_pages_remote(mm, addr, 1, + gup_flags, &page, &vma, NULL); + if (ret <= 0) { +#ifndef CONFIG_HAVE_IOREMAP_PROT + break; +#else + /* + * Check if this is a VM_IO | VM_PFNMAP VMA, which + * we can access using slightly different code. + */ + vma = vma_lookup(mm, addr); + if (!vma) + break; + if (vma->vm_ops && vma->vm_ops->access) + ret = vma->vm_ops->access(vma, addr, buf, + len, write); + if (ret <= 0) + break; + bytes = ret; +#endif + } else { + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + put_page(page); + } + len -= bytes; + buf += bytes; + addr += bytes; + } + mmap_read_unlock(mm); + + return buf - old_buf; +} + +/** + * access_remote_vm - access another process' address space + * @mm: the mm_struct of the target address space + * @addr: start address to access + * @buf: source or destination buffer + * @len: number of bytes to transfer + * @gup_flags: flags modifying lookup behaviour + * + * The caller must hold a reference on @mm. + * + * Return: number of bytes copied from source to destination. 
+ */ +int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + return __access_remote_vm(mm, addr, buf, len, gup_flags); +} + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + ret = __access_remote_vm(mm, addr, buf, len, gup_flags); + + mmput(mm); + + return ret; +} +EXPORT_SYMBOL_GPL(access_process_vm); + +/* + * Print the name of a VMA. + */ +void print_vma_addr(char *prefix, unsigned long ip) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + + /* + * we might be running from an atomic context so we cannot sleep + */ + if (!mmap_read_trylock(mm)) + return; + + vma = find_vma(mm, ip); + if (vma && vma->vm_file) { + struct file *f = vma->vm_file; + char *buf = (char *)__get_free_page(GFP_NOWAIT); + if (buf) { + char *p; + + p = file_path(f, buf, PAGE_SIZE); + if (IS_ERR(p)) + p = "?"; + printk("%s%s[%lx+%lx]", prefix, kbasename(p), + vma->vm_start, + vma->vm_end - vma->vm_start); + free_page((unsigned long)buf); + } + } + mmap_read_unlock(mm); +} + +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP) +/* + * Debug annotation for code that may take a page fault. Does nothing while + * page faults are disabled; otherwise reports a potential sleep at + * @file:@line and, with CONFIG_DEBUG_ATOMIC_SLEEP and a user mm present, + * annotates that current->mm->mmap_lock may be taken for reading. + */ +void __might_fault(const char *file, int line) +{ + if (pagefault_disabled()) + return; + __might_sleep(file, line); +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) + if (current->mm) + might_lock_read(&current->mm->mmap_lock); +#endif +} +EXPORT_SYMBOL(__might_fault); +#endif + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +/* + * Process all subpages of the specified huge page with the specified + * operation. The target subpage will be processed last to keep its + * cache lines hot.
+ */ +static inline void process_huge_page( + unsigned long addr_hint, unsigned int pages_per_huge_page, + void (*process_subpage)(unsigned long addr, int idx, void *arg), + void *arg) +{ + int i, n, base, l; + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); + + /* Process target subpage last to keep its cache lines hot */ + might_sleep(); + n = (addr_hint - addr) / PAGE_SIZE; + if (2 * n <= pages_per_huge_page) { + /* If target subpage in first half of huge page */ + base = 0; + l = n; + /* Process subpages at the end of huge page */ + for (i = pages_per_huge_page - 1; i >= 2 * n; i--) { + cond_resched(); + process_subpage(addr + i * PAGE_SIZE, i, arg); + } + } else { + /* If target subpage in second half of huge page */ + base = pages_per_huge_page - 2 * (pages_per_huge_page - n); + l = pages_per_huge_page - n; + /* Process subpages at the begin of huge page */ + for (i = 0; i < base; i++) { + cond_resched(); + process_subpage(addr + i * PAGE_SIZE, i, arg); + } + } + /* + * Process remaining subpages in left-right-left-right pattern + * towards the target subpage + */ + for (i = 0; i < l; i++) { + int left_idx = base + i; + int right_idx = base + 2 * l - 1 - i; + + cond_resched(); + process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg); + cond_resched(); + process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg); + } +} + +static void clear_gigantic_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page) +{ + int i; + struct page *p; + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + p = nth_page(page, i); + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} + +static void clear_subpage(unsigned long addr, int idx, void *arg) +{ + struct page *page = arg; + + clear_user_highpage(page + idx, addr); +} + +void clear_huge_page(struct page *page, + unsigned long addr_hint, unsigned int pages_per_huge_page) +{ + unsigned long addr = addr_hint & + 
~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, pages_per_huge_page); + return; + } + + process_huge_page(addr_hint, pages_per_huge_page, clear_subpage, page); +} + +static void copy_user_gigantic_page(struct page *dst, struct page *src, + unsigned long addr, + struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page; i++) { + dst = nth_page(dst_base, i); + src = nth_page(src_base, i); + + cond_resched(); + copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + } +} + +struct copy_subpage_arg { + struct page *dst; + struct page *src; + struct vm_area_struct *vma; +}; + +static void copy_subpage(unsigned long addr, int idx, void *arg) +{ + struct copy_subpage_arg *copy_arg = arg; + + copy_user_highpage(copy_arg->dst + idx, copy_arg->src + idx, + addr, copy_arg->vma); +} + +void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr_hint, struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); + struct copy_subpage_arg arg = { + .dst = dst, + .src = src, + .vma = vma, + }; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + copy_user_gigantic_page(dst, src, addr, vma, + pages_per_huge_page); + return; + } + + process_huge_page(addr_hint, pages_per_huge_page, copy_subpage, &arg); +} + +long copy_huge_page_from_user(struct page *dst_page, + const void __user *usr_src, + unsigned int pages_per_huge_page, + bool allow_pagefault) +{ + void *page_kaddr; + unsigned long i, rc = 0; + unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; + struct page *subpage; + + for (i = 0; i < pages_per_huge_page; i++) { + subpage = nth_page(dst_page, i); + if (allow_pagefault) + page_kaddr = kmap(subpage); + else + 
page_kaddr = kmap_atomic(subpage); + rc = copy_from_user(page_kaddr, + usr_src + i * PAGE_SIZE, PAGE_SIZE); + if (allow_pagefault) + kunmap(subpage); + else + kunmap_atomic(page_kaddr); + + ret_val -= (PAGE_SIZE - rc); + if (rc) + break; + + flush_dcache_page(subpage); + + cond_resched(); + } + return ret_val; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS + +static struct kmem_cache *page_ptl_cachep; + +void __init ptlock_cache_init(void) +{ + page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, + SLAB_PANIC, NULL); +} + +bool ptlock_alloc(struct page *page) +{ + spinlock_t *ptl; + + ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); + if (!ptl) + return false; + page->ptl = ptl; + return true; +} + +void ptlock_free(struct page *page) +{ + kmem_cache_free(page_ptl_cachep, page->ptl); +} +#endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c new file mode 100644 index 000000000..3b9d3a4b4 --- /dev/null +++ b/mm/memory_hotplug.c @@ -0,0 +1,2282 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/mm/memory_hotplug.c + * + * Copyright (C) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" +#include "shuffle.h" + +#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY +/* + * memory_hotplug.memmap_on_memory parameter + */ +static bool memmap_on_memory __ro_after_init; +module_param(memmap_on_memory, bool, 0444); +MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); + +static inline bool mhp_memmap_on_memory(void) +{ + return memmap_on_memory; +} +#else +static inline bool mhp_memmap_on_memory(void) +{ + return false; +} +#endif + +enum { + 
ONLINE_POLICY_CONTIG_ZONES = 0, + ONLINE_POLICY_AUTO_MOVABLE, +}; + +static const char * const online_policy_to_str[] = { + [ONLINE_POLICY_CONTIG_ZONES] = "contig-zones", + [ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable", +}; + +static int set_online_policy(const char *val, const struct kernel_param *kp) +{ + int ret = sysfs_match_string(online_policy_to_str, val); + + if (ret < 0) + return ret; + *((int *)kp->arg) = ret; + return 0; +} + +static int get_online_policy(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%s\n", online_policy_to_str[*((int *)kp->arg)]); +} + +/* + * memory_hotplug.online_policy: configure online behavior when onlining without + * specifying a zone (MMOP_ONLINE) + * + * "contig-zones": keep zone contiguous + * "auto-movable": online memory to ZONE_MOVABLE if the configuration + * (auto_movable_ratio, auto_movable_numa_aware) allows for it + */ +static int online_policy __read_mostly = ONLINE_POLICY_CONTIG_ZONES; +static const struct kernel_param_ops online_policy_ops = { + .set = set_online_policy, + .get = get_online_policy, +}; +module_param_cb(online_policy, &online_policy_ops, &online_policy, 0644); +MODULE_PARM_DESC(online_policy, + "Set the online policy (\"contig-zones\", \"auto-movable\") " + "Default: \"contig-zones\""); + +/* + * memory_hotplug.auto_movable_ratio: specify maximum MOVABLE:KERNEL ratio + * + * The ratio represent an upper limit and the kernel might decide to not + * online some memory to ZONE_MOVABLE -- e.g., because hotplugged KERNEL memory + * doesn't allow for more MOVABLE memory. + */ +static unsigned int auto_movable_ratio __read_mostly = 301; +module_param(auto_movable_ratio, uint, 0644); +MODULE_PARM_DESC(auto_movable_ratio, + "Set the maximum ratio of MOVABLE:KERNEL memory in the system " + "in percent for \"auto-movable\" online policy. 
Default: 301"); + +/* + * memory_hotplug.auto_movable_numa_aware: consider numa node stats + */ +#ifdef CONFIG_NUMA +static bool auto_movable_numa_aware __read_mostly = true; +module_param(auto_movable_numa_aware, bool, 0644); +MODULE_PARM_DESC(auto_movable_numa_aware, + "Consider numa node stats in addition to global stats in " + "\"auto-movable\" online policy. Default: true"); +#endif /* CONFIG_NUMA */ + +/* + * online_page_callback contains pointer to current page onlining function. + * Initially it is generic_online_page(). If it is required it could be + * changed by calling set_online_page_callback() for callback registration + * and restore_online_page_callback() for generic callback restore. + */ + +static online_page_callback_t online_page_callback = generic_online_page; +static DEFINE_MUTEX(online_page_callback_lock); + +DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock); + +void get_online_mems(void) +{ + percpu_down_read(&mem_hotplug_lock); +} + +void put_online_mems(void) +{ + percpu_up_read(&mem_hotplug_lock); +} + +bool movable_node_enabled = false; + +#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE +int mhp_default_online_type = MMOP_OFFLINE; +#else +int mhp_default_online_type = MMOP_ONLINE; +#endif + +static int __init setup_memhp_default_state(char *str) +{ + const int online_type = mhp_online_type_from_str(str); + + if (online_type >= 0) + mhp_default_online_type = online_type; + + return 1; +} +__setup("memhp_default_state=", setup_memhp_default_state); + +void mem_hotplug_begin(void) +{ + cpus_read_lock(); + percpu_down_write(&mem_hotplug_lock); +} + +void mem_hotplug_done(void) +{ + percpu_up_write(&mem_hotplug_lock); + cpus_read_unlock(); +} + +u64 max_mem_size = U64_MAX; + +/* add this memory to iomem resource */ +static struct resource *register_memory_resource(u64 start, u64 size, + const char *resource_name) +{ + struct resource *res; + unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + + if (strcmp(resource_name, "System RAM")) + 
flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED; + + if (!mhp_range_allowed(start, size, true)) + return ERR_PTR(-E2BIG); + + /* + * Make sure value parsed from 'mem=' only restricts memory adding + * while booting, so that memory hotplug won't be impacted. Please + * refer to document of 'mem=' in kernel-parameters.txt for more + * details. + */ + if (start + size > max_mem_size && system_state < SYSTEM_RUNNING) + return ERR_PTR(-E2BIG); + + /* + * Request ownership of the new memory range. This might be + * a child of an existing resource that was present but + * not marked as busy. + */ + res = __request_region(&iomem_resource, start, size, + resource_name, flags); + + if (!res) { + pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n", + start, start + size); + return ERR_PTR(-EEXIST); + } + return res; +} + +static void release_memory_resource(struct resource *res) +{ + if (!res) + return; + release_resource(res); + kfree(res); +} + +static int check_pfn_span(unsigned long pfn, unsigned long nr_pages) +{ + /* + * Disallow all operations smaller than a sub-section and only + * allow operations smaller than a section for + * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range() + * enforces a larger memory_block_size_bytes() granularity for + * memory that will be marked online, so this check should only + * fire for direct arch_{add,remove}_memory() users outside of + * add_memory_resource(). + */ + unsigned long min_align; + + if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) + min_align = PAGES_PER_SUBSECTION; + else + min_align = PAGES_PER_SECTION; + if (!IS_ALIGNED(pfn | nr_pages, min_align)) + return -EINVAL; + return 0; +} + +/* + * Return page for the valid pfn only if the page is online. 
All pfn + * walkers which rely on the fully initialized page->flags and others + * should use this rather than pfn_valid && pfn_to_page + */ +struct page *pfn_to_online_page(unsigned long pfn) +{ + unsigned long nr = pfn_to_section_nr(pfn); + struct dev_pagemap *pgmap; + struct mem_section *ms; + + if (nr >= NR_MEM_SECTIONS) + return NULL; + + ms = __nr_to_section(nr); + if (!online_section(ms)) + return NULL; + + /* + * Save some code text when online_section() + + * pfn_section_valid() are sufficient. + */ + if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn)) + return NULL; + + if (!pfn_section_valid(ms, pfn)) + return NULL; + + if (!online_device_section(ms)) + return pfn_to_page(pfn); + + /* + * Slowpath: when ZONE_DEVICE collides with + * ZONE_{NORMAL,MOVABLE} within the same section some pfns in + * the section may be 'offline' but 'valid'. Only + * get_dev_pagemap() can determine sub-section online status. + */ + pgmap = get_dev_pagemap(pfn, NULL); + put_dev_pagemap(pgmap); + + /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ + if (pgmap) + return NULL; + + return pfn_to_page(pfn); +} +EXPORT_SYMBOL_GPL(pfn_to_online_page); + +/* + * Add the pfn range [@pfn, @pfn + @nr_pages) to the sparse memory model, + * one section-bounded piece at a time. Rejects an empty pgprot, an altmap + * that does not start at @pfn or whose offset exceeds @nr_pages, and a + * range that fails check_pfn_span(). + * + * Returns 0 on success or a negative errno. + */ +int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, + struct mhp_params *params) +{ + const unsigned long end_pfn = pfn + nr_pages; + unsigned long cur_nr_pages; + /* Stays 0 if the loop below never runs (nr_pages == 0). */ + int err = 0; + struct vmem_altmap *altmap = params->altmap; + + if (WARN_ON_ONCE(!pgprot_val(params->pgprot))) + return -EINVAL; + + VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false)); + + if (altmap) { + /* + * Validate altmap is within bounds of the total request + */ + if (altmap->base_pfn != pfn + || vmem_altmap_offset(altmap) > nr_pages) { + pr_warn_once("memory add fail, invalid altmap\n"); + return -EINVAL; + } + altmap->alloc = 0; + } + + if (check_pfn_span(pfn, nr_pages)) { + WARN(1, "Misaligned %s start: %#lx end: %#lx\n", __func__, pfn, pfn + nr_pages - 1); + return -EINVAL; + } + + for (; pfn <
end_pfn; pfn += cur_nr_pages) { + /* Select all remaining pages up to the next section boundary */ + cur_nr_pages = min(end_pfn - pfn, + SECTION_ALIGN_UP(pfn + 1) - pfn); + err = sparse_add_section(nid, pfn, cur_nr_pages, altmap, + params->pgmap); + if (err) + break; + cond_resched(); + } + vmemmap_populate_print_last(); + return err; +} + +/* find the smallest valid pfn in the range [start_pfn, end_pfn) */ +static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn) +{ + for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) { + if (unlikely(!pfn_to_online_page(start_pfn))) + continue; + + if (unlikely(pfn_to_nid(start_pfn) != nid)) + continue; + + if (zone != page_zone(pfn_to_page(start_pfn))) + continue; + + return start_pfn; + } + + return 0; +} + +/* find the biggest valid pfn in the range [start_pfn, end_pfn). */ +static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn; + + /* pfn is the end pfn of a memory section. */ + pfn = end_pfn - 1; + for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) { + if (unlikely(!pfn_to_online_page(pfn))) + continue; + + if (unlikely(pfn_to_nid(pfn) != nid)) + continue; + + if (zone != page_zone(pfn_to_page(pfn))) + continue; + + return pfn; + } + + return 0; +} + +static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn; + int nid = zone_to_nid(zone); + + if (zone->zone_start_pfn == start_pfn) { + /* + * If the section is smallest section in the zone, it need + * shrink zone->zone_start_pfn and zone->zone_spanned_pages. + * In this case, we find second smallest valid mem_section + * for shrinking zone. 
+ */ + pfn = find_smallest_section_pfn(nid, zone, end_pfn, + zone_end_pfn(zone)); + if (pfn) { + zone->spanned_pages = zone_end_pfn(zone) - pfn; + zone->zone_start_pfn = pfn; + } else { + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + } + } else if (zone_end_pfn(zone) == end_pfn) { + /* + * If the section is the biggest section in the zone, we need + * to shrink zone->spanned_pages. + * In this case, we find the second biggest valid mem_section + * for shrinking the zone. + */ + pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn, + start_pfn); + if (pfn) + zone->spanned_pages = pfn - zone->zone_start_pfn + 1; + else { + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + } + } +} + +/* + * Recompute the node's start pfn and spanned pages as the range covering + * every zone that still spans pages. If no zone spans any pages, both + * values end up 0. + */ +static void update_pgdat_span(struct pglist_data *pgdat) +{ + unsigned long node_start_pfn = 0, node_end_pfn = 0; + struct zone *zone; + + for (zone = pgdat->node_zones; + zone < pgdat->node_zones + MAX_NR_ZONES; zone++) { + unsigned long end_pfn = zone_end_pfn(zone); + + /* No need to lock the zones, they can't change. */ + if (!zone->spanned_pages) + continue; + if (!node_end_pfn) { + node_start_pfn = zone->zone_start_pfn; + node_end_pfn = end_pfn; + continue; + } + + if (end_pfn > node_end_pfn) + node_end_pfn = end_pfn; + if (zone->zone_start_pfn < node_start_pfn) + node_start_pfn = zone->zone_start_pfn; + } + + pgdat->node_start_pfn = node_start_pfn; + pgdat->node_spanned_pages = node_end_pfn - node_start_pfn; +} + +void __ref remove_pfn_range_from_zone(struct zone *zone, + unsigned long start_pfn, + unsigned long nr_pages) +{ + const unsigned long end_pfn = start_pfn + nr_pages; + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long pfn, cur_nr_pages; + + /* Poison struct pages because they are now uninitialized again.
*/ + for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) { + cond_resched(); + + /* Select all remaining pages up to the next section boundary */ + cur_nr_pages = + min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); + page_init_poison(pfn_to_page(pfn), + sizeof(struct page) * cur_nr_pages); + } + + /* + * Zone shrinking code cannot properly deal with ZONE_DEVICE. So + * we will not try to shrink the zones - which is okay as + * set_zone_contiguous() cannot deal with ZONE_DEVICE either way. + */ + if (zone_is_zone_device(zone)) + return; + + clear_zone_contiguous(zone); + + shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); + update_pgdat_span(pgdat); + + set_zone_contiguous(zone); +} + +static void __remove_section(unsigned long pfn, unsigned long nr_pages, + unsigned long map_offset, + struct vmem_altmap *altmap) +{ + struct mem_section *ms = __pfn_to_section(pfn); + + if (WARN_ON_ONCE(!valid_section(ms))) + return; + + sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap); +} + +/** + * __remove_pages() - remove sections of pages + * @pfn: starting pageframe (must be aligned to start of a section) + * @nr_pages: number of pages to remove (must be multiple of section size) + * @altmap: alternative device page map or %NULL if default memmap is used + * + * Generic helper function to remove section mappings and sysfs entries + * for the section of the memory we are removing. Caller needs to make + * sure that pages are marked reserved and zones are adjust properly by + * calling offline_pages(). 
+ */ +void __remove_pages(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + const unsigned long end_pfn = pfn + nr_pages; + unsigned long cur_nr_pages; + unsigned long map_offset = 0; + + map_offset = vmem_altmap_offset(altmap); + + if (check_pfn_span(pfn, nr_pages)) { + WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1); + return; + } + + for (; pfn < end_pfn; pfn += cur_nr_pages) { + cond_resched(); + /* Select all remaining pages up to the next section boundary */ + cur_nr_pages = min(end_pfn - pfn, + SECTION_ALIGN_UP(pfn + 1) - pfn); + __remove_section(pfn, cur_nr_pages, map_offset, altmap); + map_offset = 0; + } +} + +int set_online_page_callback(online_page_callback_t callback) +{ + int rc = -EINVAL; + + get_online_mems(); + mutex_lock(&online_page_callback_lock); + + if (online_page_callback == generic_online_page) { + online_page_callback = callback; + rc = 0; + } + + mutex_unlock(&online_page_callback_lock); + put_online_mems(); + + return rc; +} +EXPORT_SYMBOL_GPL(set_online_page_callback); + +int restore_online_page_callback(online_page_callback_t callback) +{ + int rc = -EINVAL; + + get_online_mems(); + mutex_lock(&online_page_callback_lock); + + if (online_page_callback == callback) { + online_page_callback = generic_online_page; + rc = 0; + } + + mutex_unlock(&online_page_callback_lock); + put_online_mems(); + + return rc; +} +EXPORT_SYMBOL_GPL(restore_online_page_callback); + +void generic_online_page(struct page *page, unsigned int order) +{ + /* + * Freeing the page with debug_pagealloc enabled will try to unmap it, + * so we should map it first. This is better than introducing a special + * case in page freeing fast path. 
+ */ + debug_pagealloc_map_pages(page, 1 << order); + __free_pages_core(page, order); + totalram_pages_add(1UL << order); +} +EXPORT_SYMBOL_GPL(generic_online_page); + +static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) +{ + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; + + /* + * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might + * decide to not expose all pages to the buddy (e.g., expose them + * later). We account all pages as being online and belonging to this + * zone ("present"). + * When using memmap_on_memory, the range might not be aligned to + * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect + * this and the first chunk to online will be pageblock_nr_pages. + */ + for (pfn = start_pfn; pfn < end_pfn;) { + int order = min(MAX_ORDER - 1UL, __ffs(pfn)); + + (*online_page_callback)(pfn_to_page(pfn), order); + pfn += (1UL << order); + } + + /* mark all involved sections as online */ + online_mem_sections(start_pfn, end_pfn); +} + +/* check which state of node_states will be changed when online memory */ +static void node_states_check_changes_online(unsigned long nr_pages, + struct zone *zone, struct memory_notify *arg) +{ + int nid = zone_to_nid(zone); + + arg->status_change_nid = NUMA_NO_NODE; + arg->status_change_nid_normal = NUMA_NO_NODE; + + if (!node_state(nid, N_MEMORY)) + arg->status_change_nid = nid; + if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) + arg->status_change_nid_normal = nid; +} + +static void node_states_set_node(int node, struct memory_notify *arg) +{ + if (arg->status_change_nid_normal >= 0) + node_set_state(node, N_NORMAL_MEMORY); + + if (arg->status_change_nid >= 0) + node_set_state(node, N_MEMORY); +} + +static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = zone_end_pfn(zone); + + if (zone_is_empty(zone) || start_pfn < 
zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; +} + +static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = pgdat_end_pfn(pgdat); + + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; + +} + +#ifdef CONFIG_ZONE_DEVICE +static void section_taint_zone_device(unsigned long pfn) +{ + struct mem_section *ms = __pfn_to_section(pfn); + + ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE; +} +#else +static inline void section_taint_zone_device(unsigned long pfn) +{ +} +#endif + +/* + * Associate the pfn range with the given zone, initializing the memmaps + * and resizing the pgdat/zone data to span the added pages. After this + * call, all affected pages are PG_reserved. + * + * All aligned pageblocks are initialized to the specified migratetype + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related + * zone stats (e.g., nr_isolate_pageblock) are touched. + */ +void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages, + struct vmem_altmap *altmap, int migratetype) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nid = pgdat->node_id; + + clear_zone_contiguous(zone); + + if (zone_is_empty(zone)) + init_currently_empty_zone(zone, start_pfn, nr_pages); + resize_zone_range(zone, start_pfn, nr_pages); + resize_pgdat_range(pgdat, start_pfn, nr_pages); + + /* + * Subsection population requires care in pfn_to_online_page(). + * Set the taint to enable the slow path detection of + * ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE} + * section. 
+ */ + if (zone_is_zone_device(zone)) { + if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION)) + section_taint_zone_device(start_pfn); + if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)) + section_taint_zone_device(start_pfn + nr_pages); + } + + /* + * TODO now we have a visible range of pages which are not associated + * with their zone properly. Not nice but set_pfnblock_flags_mask + * expects the zone spans the pfn range. All the pages in the range + * are reserved so nobody should be touching them so we should be safe + */ + memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0, + MEMINIT_HOTPLUG, altmap, migratetype); + + set_zone_contiguous(zone); +} + +struct auto_movable_stats { + unsigned long kernel_early_pages; + unsigned long movable_pages; +}; + +static void auto_movable_stats_account_zone(struct auto_movable_stats *stats, + struct zone *zone) +{ + if (zone_idx(zone) == ZONE_MOVABLE) { + stats->movable_pages += zone->present_pages; + } else { + stats->kernel_early_pages += zone->present_early_pages; +#ifdef CONFIG_CMA + /* + * CMA pages (never on hotplugged memory) behave like + * ZONE_MOVABLE. + */ + stats->movable_pages += zone->cma_pages; + stats->kernel_early_pages -= zone->cma_pages; +#endif /* CONFIG_CMA */ + } +} +struct auto_movable_group_stats { + unsigned long movable_pages; + unsigned long req_kernel_early_pages; +}; + +static int auto_movable_stats_account_group(struct memory_group *group, + void *arg) +{ + const int ratio = READ_ONCE(auto_movable_ratio); + struct auto_movable_group_stats *stats = arg; + long pages; + + /* + * We don't support modifying the config while the auto-movable online + * policy is already enabled. Just avoid the division by zero below. + */ + if (!ratio) + return 0; + + /* + * Calculate how many early kernel pages this group requires to + * satisfy the configured zone ratio. 
+ */ + pages = group->present_movable_pages * 100 / ratio; + pages -= group->present_kernel_pages; + + if (pages > 0) + stats->req_kernel_early_pages += pages; + stats->movable_pages += group->present_movable_pages; + return 0; +} + +static bool auto_movable_can_online_movable(int nid, struct memory_group *group, + unsigned long nr_pages) +{ + unsigned long kernel_early_pages, movable_pages; + struct auto_movable_group_stats group_stats = {}; + struct auto_movable_stats stats = {}; + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *zone; + int i; + + /* Walk all relevant zones and collect MOVABLE vs. KERNEL stats. */ + if (nid == NUMA_NO_NODE) { + /* TODO: cache values */ + for_each_populated_zone(zone) + auto_movable_stats_account_zone(&stats, zone); + } else { + for (i = 0; i < MAX_NR_ZONES; i++) { + zone = pgdat->node_zones + i; + if (populated_zone(zone)) + auto_movable_stats_account_zone(&stats, zone); + } + } + + kernel_early_pages = stats.kernel_early_pages; + movable_pages = stats.movable_pages; + + /* + * Kernel memory inside dynamic memory group allows for more MOVABLE + * memory within the same group. Remove the effect of all but the + * current group from the stats. + */ + walk_dynamic_memory_groups(nid, auto_movable_stats_account_group, + group, &group_stats); + if (kernel_early_pages <= group_stats.req_kernel_early_pages) + return false; + kernel_early_pages -= group_stats.req_kernel_early_pages; + movable_pages -= group_stats.movable_pages; + + if (group && group->is_dynamic) + kernel_early_pages += group->present_kernel_pages; + + /* + * Test if we could online the given number of pages to ZONE_MOVABLE + * and still stay in the configured ratio. + */ + movable_pages += nr_pages; + return movable_pages <= (auto_movable_ratio * kernel_early_pages) / 100; +} + +/* + * Returns a default kernel memory zone for the given pfn range. + * If no kernel zone covers this pfn range it will automatically go + * to the ZONE_NORMAL. 
+ */
+static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
+		unsigned long nr_pages)
+{
+	struct zone *zones = NODE_DATA(nid)->node_zones;
+	struct zone *candidate;
+
+	/* The first zone below ZONE_NORMAL that intersects the range wins. */
+	for (candidate = zones; candidate < zones + ZONE_NORMAL; candidate++)
+		if (zone_intersects(candidate, start_pfn, nr_pages))
+			return candidate;
+
+	/* Nothing below ZONE_NORMAL intersects: fall back to ZONE_NORMAL. */
+	return zones + ZONE_NORMAL;
+}
+
+/*
+ * Determine to which zone to online memory dynamically based on user
+ * configuration and system stats. We care about the following ratio:
+ *
+ *   MOVABLE : KERNEL
+ *
+ * Whereby MOVABLE is memory in ZONE_MOVABLE and KERNEL is memory in
+ * one of the kernel zones. CMA pages inside one of the kernel zones really
+ * behave like ZONE_MOVABLE, so we treat them accordingly.
+ *
+ * We don't allow for hotplugged memory in a KERNEL zone to increase the
+ * amount of MOVABLE memory we can have, so we end up with:
+ *
+ *   MOVABLE : KERNEL_EARLY
+ *
+ * Whereby KERNEL_EARLY is memory in one of the kernel zones, available since
+ * boot. We base our calculation on KERNEL_EARLY internally, because:
+ *
+ * a) Hotplugged memory in one of the kernel zones can sometimes still get
+ *    hotunplugged, especially when hot(un)plugging individual memory blocks.
+ *    There is no coordination across memory devices, therefore "automatic"
+ *    hotunplugging, as implemented in hypervisors, could result in zone
+ *    imbalances.
+ * b) Early/boot memory in one of the kernel zones can usually not get
+ *    hotunplugged again (e.g., no firmware interface to unplug, fragmented
+ *    with unmovable allocations). While there are corner cases where it might
+ *    still work, it is barely relevant in practice.
+ *
+ * Exceptions are dynamic memory groups, which allow for more MOVABLE
+ * memory within the same memory group -- because in that case, there is
+ * coordination within the single memory device managed by a single driver.
+ * + * We rely on "present pages" instead of "managed pages", as the latter is + * highly unreliable and dynamic in virtualized environments, and does not + * consider boot time allocations. For example, memory ballooning adjusts the + * managed pages when inflating/deflating the balloon, and balloon compaction + * can even migrate inflated pages between zones. + * + * Using "present pages" is better but some things to keep in mind are: + * + * a) Some memblock allocations, such as for the crashkernel area, are + * effectively unused by the kernel, yet they account to "present pages". + * Fortunately, these allocations are comparatively small in relevant setups + * (e.g., fraction of system memory). + * b) Some hotplugged memory blocks in virtualized environments, esecially + * hotplugged by virtio-mem, look like they are completely present, however, + * only parts of the memory block are actually currently usable. + * "present pages" is an upper limit that can get reached at runtime. As + * we base our calculations on KERNEL_EARLY, this is not an issue. + */ +static struct zone *auto_movable_zone_for_pfn(int nid, + struct memory_group *group, + unsigned long pfn, + unsigned long nr_pages) +{ + unsigned long online_pages = 0, max_pages, end_pfn; + struct page *page; + + if (!auto_movable_ratio) + goto kernel_zone; + + if (group && !group->is_dynamic) { + max_pages = group->s.max_pages; + online_pages = group->present_movable_pages; + + /* If anything is !MOVABLE online the rest !MOVABLE. */ + if (group->present_kernel_pages) + goto kernel_zone; + } else if (!group || group->d.unit_pages == nr_pages) { + max_pages = nr_pages; + } else { + max_pages = group->d.unit_pages; + /* + * Take a look at all online sections in the current unit. + * We can safely assume that all pages within a section belong + * to the same zone, because dynamic memory groups only deal + * with hotplugged memory. 
+ */ + pfn = ALIGN_DOWN(pfn, group->d.unit_pages); + end_pfn = pfn + group->d.unit_pages; + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + page = pfn_to_online_page(pfn); + if (!page) + continue; + /* If anything is !MOVABLE online the rest !MOVABLE. */ + if (!is_zone_movable_page(page)) + goto kernel_zone; + online_pages += PAGES_PER_SECTION; + } + } + + /* + * Online MOVABLE if we could *currently* online all remaining parts + * MOVABLE. We expect to (add+) online them immediately next, so if + * nobody interferes, all will be MOVABLE if possible. + */ + nr_pages = max_pages - online_pages; + if (!auto_movable_can_online_movable(NUMA_NO_NODE, group, nr_pages)) + goto kernel_zone; + +#ifdef CONFIG_NUMA + if (auto_movable_numa_aware && + !auto_movable_can_online_movable(nid, group, nr_pages)) + goto kernel_zone; +#endif /* CONFIG_NUMA */ + + return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; +kernel_zone: + return default_kernel_zone_for_pfn(nid, pfn, nr_pages); +} + +static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, + unsigned long nr_pages) +{ + struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn, + nr_pages); + struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; + bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages); + bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages); + + /* + * We inherit the existing zone in a simple case where zones do not + * overlap in the given range + */ + if (in_kernel ^ in_movable) + return (in_kernel) ? kernel_zone : movable_zone; + + /* + * If the range doesn't belong to any zone or two zones overlap in the + * given range then we use movable zone only if movable_node is + * enabled because we always online to a kernel zone by default. + */ + return movable_node_enabled ? 
movable_zone : kernel_zone; +} + +struct zone *zone_for_pfn_range(int online_type, int nid, + struct memory_group *group, unsigned long start_pfn, + unsigned long nr_pages) +{ + if (online_type == MMOP_ONLINE_KERNEL) + return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); + + if (online_type == MMOP_ONLINE_MOVABLE) + return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; + + if (online_policy == ONLINE_POLICY_AUTO_MOVABLE) + return auto_movable_zone_for_pfn(nid, group, start_pfn, nr_pages); + + return default_zone_for_pfn(nid, start_pfn, nr_pages); +} + +/* + * This function should only be called by memory_block_{online,offline}, + * and {online,offline}_pages. + */ +void adjust_present_page_count(struct page *page, struct memory_group *group, + long nr_pages) +{ + struct zone *zone = page_zone(page); + const bool movable = zone_idx(zone) == ZONE_MOVABLE; + + /* + * We only support onlining/offlining/adding/removing of complete + * memory blocks; therefore, either all is either early or hotplugged. + */ + if (early_section(__pfn_to_section(page_to_pfn(page)))) + zone->present_early_pages += nr_pages; + zone->present_pages += nr_pages; + zone->zone_pgdat->node_present_pages += nr_pages; + + if (group && movable) + group->present_movable_pages += nr_pages; + else if (group && !movable) + group->present_kernel_pages += nr_pages; +} + +int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, + struct zone *zone) +{ + unsigned long end_pfn = pfn + nr_pages; + int ret, i; + + ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); + if (ret) + return ret; + + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + + for (i = 0; i < nr_pages; i++) + SetPageVmemmapSelfHosted(pfn_to_page(pfn + i)); + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections online here as otherwise they will be + * left offline. 
+ */ + if (nr_pages >= PAGES_PER_SECTION) + online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + return ret; +} + +void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long end_pfn = pfn + nr_pages; + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections offline here as otherwise they will be + * left online. + */ + if (nr_pages >= PAGES_PER_SECTION) + offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + /* + * The pages associated with this vmemmap have been offlined, so + * we can reset its state here. + */ + remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages); + kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); +} + +/* + * Must be called with mem_hotplug_lock in write mode. + */ +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + struct zone *zone, struct memory_group *group) +{ + unsigned long flags; + int need_zonelists_rebuild = 0; + const int nid = zone_to_nid(zone); + int ret; + struct memory_notify arg; + + /* + * {on,off}lining is constrained to full memory sections (or more + * precisely to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. 
+ */ + if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(pfn) || + !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) + return -EINVAL; + + + /* associate pfn range with the zone */ + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); + + arg.start_pfn = pfn; + arg.nr_pages = nr_pages; + node_states_check_changes_online(nr_pages, zone, &arg); + + ret = memory_notify(MEM_GOING_ONLINE, &arg); + ret = notifier_to_errno(ret); + if (ret) + goto failed_addition; + + /* + * Fixup the number of isolated pageblocks before marking the sections + * onlining, such that undo_isolate_page_range() works correctly. + */ + spin_lock_irqsave(&zone->lock, flags); + zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages; + spin_unlock_irqrestore(&zone->lock, flags); + + /* + * If this zone is not populated, then it is not in zonelist. + * This means the page allocator ignores this zone. + * So, zonelist must be updated after online. + */ + if (!populated_zone(zone)) { + need_zonelists_rebuild = 1; + setup_zone_pageset(zone); + } + + online_pages_range(pfn, nr_pages); + adjust_present_page_count(pfn_to_page(pfn), group, nr_pages); + + node_states_set_node(nid, &arg); + if (need_zonelists_rebuild) + build_all_zonelists(NULL); + + /* Basic onlining is complete, allow allocation of onlined pages. */ + undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE); + + /* + * Freshly onlined pages aren't shuffled (e.g., all pages are placed to + * the tail of the freelist when undoing isolation). Shuffle the whole + * zone to make sure the just onlined pages are properly distributed + * across the whole freelist - to create an initial shuffle. 
+ */ + shuffle_zone(zone); + + /* reinitialise watermarks and update pcp limits */ + init_per_zone_wmark_min(); + + kswapd_run(nid); + kcompactd_run(nid); + + writeback_set_ratelimit(); + + memory_notify(MEM_ONLINE, &arg); + return 0; + +failed_addition: + pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", + (unsigned long long) pfn << PAGE_SHIFT, + (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); + memory_notify(MEM_CANCEL_ONLINE, &arg); + remove_pfn_range_from_zone(zone, pfn, nr_pages); + return ret; +} + +static void reset_node_present_pages(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + z->present_pages = 0; + + pgdat->node_present_pages = 0; +} + +/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ +static pg_data_t __ref *hotadd_init_pgdat(int nid) +{ + struct pglist_data *pgdat; + + /* + * NODE_DATA is preallocated (free_area_init) but its internal + * state is not allocated completely. Add missing pieces. + * Completely offline nodes stay around and they just need + * reintialization. + */ + pgdat = NODE_DATA(nid); + + /* init node's zones as empty zones, we don't have any present pages.*/ + free_area_init_core_hotplug(pgdat); + + /* + * The node we allocated has no zone fallback lists. For avoiding + * to access not-initialized zonelist, build here. + */ + build_all_zonelists(pgdat); + + /* + * When memory is hot-added, all the memory is in offline state. So + * clear all zones' present_pages because they will be updated in + * online_pages() and offline_pages(). + * TODO: should be in free_area_init_core_hotplug? + */ + reset_node_managed_pages(pgdat); + reset_node_present_pages(pgdat); + + return pgdat; +} + +/* + * __try_online_node - online a node if offlined + * @nid: the node ID + * @set_node_online: Whether we want to online the node + * called by cpu_up() to online a node without onlined memory. 
+ * + * Returns: + * 1 -> a new node has been allocated + * 0 -> the node is already online + * -ENOMEM -> the node could not be allocated + */ +static int __try_online_node(int nid, bool set_node_online) +{ + pg_data_t *pgdat; + int ret = 1; + + if (node_online(nid)) + return 0; + + pgdat = hotadd_init_pgdat(nid); + if (!pgdat) { + pr_err("Cannot online node %d due to NULL pgdat\n", nid); + ret = -ENOMEM; + goto out; + } + + if (set_node_online) { + node_set_online(nid); + ret = register_one_node(nid); + BUG_ON(ret); + } +out: + return ret; +} + +/* + * Users of this function always want to online/register the node + */ +int try_online_node(int nid) +{ + int ret; + + mem_hotplug_begin(); + ret = __try_online_node(nid, true); + mem_hotplug_done(); + return ret; +} + +static int check_hotplug_memory_range(u64 start, u64 size) +{ + /* memory range must be block size aligned */ + if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) || + !IS_ALIGNED(size, memory_block_size_bytes())) { + pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx", + memory_block_size_bytes(), start, size); + return -EINVAL; + } + + return 0; +} + +static int online_memory_block(struct memory_block *mem, void *arg) +{ + mem->online_type = mhp_default_online_type; + return device_online(&mem->dev); +} + +bool mhp_supports_memmap_on_memory(unsigned long size) +{ + unsigned long nr_vmemmap_pages = size / PAGE_SIZE; + unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); + unsigned long remaining_size = size - vmemmap_size; + + /* + * Besides having arch support and the feature enabled at runtime, we + * need a few more assumptions to hold true: + * + * a) We span a single memory block: memory onlining/offlinin;g happens + * in memory block granularity. We don't want the vmemmap of online + * memory blocks to reside on offline memory blocks. In the future, + * we might want to support variable-sized memory blocks to make the + * feature more versatile. 
+ * + * b) The vmemmap pages span complete PMDs: We don't want vmemmap code + * to populate memory from the altmap for unrelated parts (i.e., + * other memory blocks) + * + * c) The vmemmap pages (and thereby the pages that will be exposed to + * the buddy) have to cover full pageblocks: memory onlining/offlining + * code requires applicable ranges to be page-aligned, for example, to + * set the migratetypes properly. + * + * TODO: Although we have a check here to make sure that vmemmap pages + * fully populate a PMD, it is not the right place to check for + * this. A much better solution involves improving vmemmap code + * to fallback to base pages when trying to populate vmemmap using + * altmap as an alternative source of memory, and we do not exactly + * populate a single PMD. + */ + return mhp_memmap_on_memory() && + size == memory_block_size_bytes() && + IS_ALIGNED(vmemmap_size, PMD_SIZE) && + IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)); +} + +/* + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations (triggered e.g. by sysfs). 
+ * + * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG + */ +int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) +{ + struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; + enum memblock_flags memblock_flags = MEMBLOCK_NONE; + struct vmem_altmap mhp_altmap = {}; + struct memory_group *group = NULL; + u64 start, size; + bool new_node = false; + int ret; + + start = res->start; + size = resource_size(res); + + ret = check_hotplug_memory_range(start, size); + if (ret) + return ret; + + if (mhp_flags & MHP_NID_IS_MGID) { + group = memory_group_find_by_id(nid); + if (!group) + return -EINVAL; + nid = group->nid; + } + + if (!node_possible(nid)) { + WARN(1, "node %d was absent from the node_possible_map\n", nid); + return -EINVAL; + } + + mem_hotplug_begin(); + + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { + if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED) + memblock_flags = MEMBLOCK_DRIVER_MANAGED; + ret = memblock_add_node(start, size, nid, memblock_flags); + if (ret) + goto error_mem_hotplug_end; + } + + ret = __try_online_node(nid, false); + if (ret < 0) + goto error; + new_node = ret; + + /* + * Self hosted memmap array + */ + if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { + if (!mhp_supports_memmap_on_memory(size)) { + ret = -EINVAL; + goto error; + } + mhp_altmap.free = PHYS_PFN(size); + mhp_altmap.base_pfn = PHYS_PFN(start); + params.altmap = &mhp_altmap; + } + + /* call arch's memory hotadd */ + ret = arch_add_memory(nid, start, size, ¶ms); + if (ret < 0) + goto error; + + /* create memory block devices after memory was added */ + ret = create_memory_block_devices(start, size, mhp_altmap.alloc, + group); + if (ret) { + arch_remove_memory(start, size, params.altmap); + goto error; + } + + if (new_node) { + /* If sysfs file of new node can't be created, cpu on the node + * can't be hot-added. There is no rollback way now. + * So, check by BUG_ON() to catch it reluctantly.. + * We online node here. 
We can't roll back from here. + */ + node_set_online(nid); + ret = __register_one_node(nid); + BUG_ON(ret); + } + + register_memory_blocks_under_node(nid, PFN_DOWN(start), + PFN_UP(start + size - 1), + MEMINIT_HOTPLUG); + + /* create new memmap entry */ + if (!strcmp(res->name, "System RAM")) + firmware_map_add_hotplug(start, start + size, "System RAM"); + + /* device_online() will take the lock when calling online_pages() */ + mem_hotplug_done(); + + /* + * In case we're allowed to merge the resource, flag it and trigger + * merging now that adding succeeded. + */ + if (mhp_flags & MHP_MERGE_RESOURCE) + merge_system_ram_resource(res); + + /* online pages if requested */ + if (mhp_default_online_type != MMOP_OFFLINE) + walk_memory_blocks(start, size, NULL, online_memory_block); + + return ret; +error: + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) + memblock_remove(start, size); +error_mem_hotplug_end: + mem_hotplug_done(); + return ret; +} + +/* requires device_hotplug_lock, see add_memory_resource() */ +int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) +{ + struct resource *res; + int ret; + + res = register_memory_resource(start, size, "System RAM"); + if (IS_ERR(res)) + return PTR_ERR(res); + + ret = add_memory_resource(nid, res, mhp_flags); + if (ret < 0) + release_memory_resource(res); + return ret; +} + +int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags) +{ + int rc; + + lock_device_hotplug(); + rc = __add_memory(nid, start, size, mhp_flags); + unlock_device_hotplug(); + + return rc; +} +EXPORT_SYMBOL_GPL(add_memory); + +/* + * Add special, driver-managed memory to the system as system RAM. Such + * memory is not exposed via the raw firmware-provided memmap as system + * RAM, instead, it is detected and added by a driver - during cold boot, + * after a reboot, and after kexec. 
+ * + * Reasons why this memory should not be used for the initial memmap of a + * kexec kernel or for placing kexec images: + * - The booting kernel is in charge of determining how this memory will be + * used (e.g., use persistent memory as system RAM) + * - Coordination with a hypervisor is required before this memory + * can be used (e.g., inaccessible parts). + * + * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided + * memory map") are created. Also, the created memory resource is flagged + * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case + * this memory as well (esp., not place kexec images onto it). + * + * The resource_name (visible via /proc/iomem) has to have the format + * "System RAM ($DRIVER)". + */ +int add_memory_driver_managed(int nid, u64 start, u64 size, + const char *resource_name, mhp_t mhp_flags) +{ + struct resource *res; + int rc; + + if (!resource_name || + strstr(resource_name, "System RAM (") != resource_name || + resource_name[strlen(resource_name) - 1] != ')') + return -EINVAL; + + lock_device_hotplug(); + + res = register_memory_resource(start, size, resource_name); + if (IS_ERR(res)) { + rc = PTR_ERR(res); + goto out_unlock; + } + + rc = add_memory_resource(nid, res, mhp_flags); + if (rc < 0) + release_memory_resource(res); + +out_unlock: + unlock_device_hotplug(); + return rc; +} +EXPORT_SYMBOL_GPL(add_memory_driver_managed); + +/* + * Platforms should define arch_get_mappable_range() that provides + * maximum possible addressable physical memory range for which the + * linear mapping could be created. The platform returned address + * range must adhere to these following semantics. + * + * - range.start <= range.end + * - Range includes both end points [range.start..range.end] + * + * There is also a fallback definition provided here, allowing the + * entire possible physical address range in case any platform does + * not define arch_get_mappable_range(). 
+ */ +struct range __weak arch_get_mappable_range(void) +{ + struct range mhp_range = { + .start = 0UL, + .end = -1ULL, + }; + return mhp_range; +} + +struct range mhp_get_pluggable_range(bool need_mapping) +{ + const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1; + struct range mhp_range; + + if (need_mapping) { + mhp_range = arch_get_mappable_range(); + if (mhp_range.start > max_phys) { + mhp_range.start = 0; + mhp_range.end = 0; + } + mhp_range.end = min_t(u64, mhp_range.end, max_phys); + } else { + mhp_range.start = 0; + mhp_range.end = max_phys; + } + return mhp_range; +} +EXPORT_SYMBOL_GPL(mhp_get_pluggable_range); + +bool mhp_range_allowed(u64 start, u64 size, bool need_mapping) +{ + struct range mhp_range = mhp_get_pluggable_range(need_mapping); + u64 end = start + size; + + if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end) + return true; + + pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n", + start, end, mhp_range.start, mhp_range.end); + return false; +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +/* + * Scan pfn range [start,end) to find movable/migratable pages (LRU pages, + * non-lru movable pages and hugepages). Will skip over most unmovable + * pages (esp., pages that can be skipped when offlining), but bail out on + * definitely unmovable pages. + * + * Returns: + * 0 in case a movable page is found and movable_pfn was updated. + * -ENOENT in case no movable page was found. + * -EBUSY in case a definitely unmovable page was found. 
+ */ +static int scan_movable_pages(unsigned long start, unsigned long end, + unsigned long *movable_pfn) +{ + unsigned long pfn; + + for (pfn = start; pfn < end; pfn++) { + struct page *page, *head; + unsigned long skip; + + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + if (PageLRU(page)) + goto found; + if (__PageMovable(page)) + goto found; + + /* + * PageOffline() pages that are not marked __PageMovable() and + * have a reference count > 0 (after MEM_GOING_OFFLINE) are + * definitely unmovable. If their reference count would be 0, + * they could at least be skipped when offlining memory. + */ + if (PageOffline(page) && page_count(page)) + return -EBUSY; + + if (!PageHuge(page)) + continue; + head = compound_head(page); + /* + * This test is racy as we hold no reference or lock. The + * hugetlb page could have been free'ed and head is no longer + * a hugetlb page before the following check. In such unlikely + * cases false positives and negatives are possible. Calling + * code must deal with these scenarios. + */ + if (HPageMigratable(head)) + goto found; + skip = compound_nr(head) - (pfn - page_to_pfn(head)); + pfn += skip - 1; + } + return -ENOENT; +found: + *movable_pfn = pfn; + return 0; +} + +static int +do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + struct page *page, *head; + int ret = 0; + LIST_HEAD(source); + static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + struct folio *folio; + + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + folio = page_folio(page); + head = &folio->page; + + if (PageHuge(page)) { + pfn = page_to_pfn(head) + compound_nr(head) - 1; + isolate_hugetlb(head, &source); + continue; + } else if (PageTransHuge(page)) + pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; + + /* + * HWPoison pages have elevated reference counts so the migration would + * fail on them. 
It also doesn't make any sense to migrate them in the + * first place. Still try to unmap such a page in case it is still mapped + * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep + * the unmap as the catch all safety net). + */ + if (PageHWPoison(page)) { + if (WARN_ON(folio_test_lru(folio))) + folio_isolate_lru(folio); + if (folio_mapped(folio)) + try_to_unmap(folio, TTU_IGNORE_MLOCK); + continue; + } + + if (!get_page_unless_zero(page)) + continue; + /* + * We can skip free pages. And we can deal with pages on + * LRU and non-lru movable pages. + */ + if (PageLRU(page)) + ret = isolate_lru_page(page); + else + ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); + if (!ret) { /* Success */ + list_add_tail(&page->lru, &source); + if (!__PageMovable(page)) + inc_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_lru(page)); + + } else { + if (__ratelimit(&migrate_rs)) { + pr_warn("failed to isolate pfn %lx\n", pfn); + dump_page(page, "isolation failed"); + } + } + put_page(page); + } + if (!list_empty(&source)) { + nodemask_t nmask = node_states[N_MEMORY]; + struct migration_target_control mtc = { + .nmask = &nmask, + .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, + }; + + /* + * We have checked that migration range is on a single zone so + * we can use the nid of the first page to all the others. + */ + mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru)); + + /* + * try to allocate from a different node but reuse this node + * if there are no other online nodes to be used (e.g. 
we are + * offlining a part of the only existing node) + */ + node_clear(mtc.nid, nmask); + if (nodes_empty(nmask)) + node_set(mtc.nid, nmask); + ret = migrate_pages(&source, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL); + if (ret) { + list_for_each_entry(page, &source, lru) { + if (__ratelimit(&migrate_rs)) { + pr_warn("migrating pfn %lx failed ret:%d\n", + page_to_pfn(page), ret); + dump_page(page, "migration failure"); + } + } + putback_movable_pages(&source); + } + } + + return ret; +} + +static int __init cmdline_parse_movable_node(char *p) +{ + movable_node_enabled = true; + return 0; +} +early_param("movable_node", cmdline_parse_movable_node); + +/* check which state of node_states will be changed when offline memory */ +static void node_states_check_changes_offline(unsigned long nr_pages, + struct zone *zone, struct memory_notify *arg) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long present_pages = 0; + enum zone_type zt; + + arg->status_change_nid = NUMA_NO_NODE; + arg->status_change_nid_normal = NUMA_NO_NODE; + + /* + * Check whether node_states[N_NORMAL_MEMORY] will be changed. + * If the memory to be offline is within the range + * [0..ZONE_NORMAL], and it is the last present memory there, + * the zones in that range will become empty after the offlining, + * thus we can determine that we need to clear the node from + * node_states[N_NORMAL_MEMORY]. + */ + for (zt = 0; zt <= ZONE_NORMAL; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages) + arg->status_change_nid_normal = zone_to_nid(zone); + + /* + * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM + * does not apply as we don't support 32bit. + * Here we count the possible pages from ZONE_MOVABLE. 
+ * If after having accounted all the pages, we see that the nr_pages + * to be offlined is over or equal to the accounted pages, + * we know that the node will become empty, and so, we can clear + * it for N_MEMORY as well. + */ + present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages; + + if (nr_pages >= present_pages) + arg->status_change_nid = zone_to_nid(zone); +} + +static void node_states_clear_node(int node, struct memory_notify *arg) +{ + if (arg->status_change_nid_normal >= 0) + node_clear_state(node, N_NORMAL_MEMORY); + + if (arg->status_change_nid >= 0) + node_clear_state(node, N_MEMORY); +} + +static int count_system_ram_pages_cb(unsigned long start_pfn, + unsigned long nr_pages, void *data) +{ + unsigned long *nr_system_ram_pages = data; + + *nr_system_ram_pages += nr_pages; + return 0; +} + +/* + * Must be called with mem_hotplug_lock in write mode. + */ +int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct zone *zone, struct memory_group *group) +{ + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn, system_ram_pages = 0; + const int node = zone_to_nid(zone); + unsigned long flags; + struct memory_notify arg; + char *reason; + int ret; + + /* + * {on,off}lining is constrained to full memory sections (or more + * precisely to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. + */ + if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(start_pfn) || + !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) + return -EINVAL; + + /* + * Don't allow to offline memory blocks that contain holes. + * Consequently, memory blocks with holes can never get onlined + * via the hotplug path - online_pages() - as hotplugged memory has + * no holes. 
This way, we e.g., don't have to worry about marking + * memory holes PG_reserved, don't need pfn_valid() checks, and can + * avoid using walk_system_ram_range() later. + */ + walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages, + count_system_ram_pages_cb); + if (system_ram_pages != nr_pages) { + ret = -EINVAL; + reason = "memory holes"; + goto failed_removal; + } + + /* + * We only support offlining of memory blocks managed by a single zone, + * checked by calling code. This is just a sanity check that we might + * want to remove in the future. + */ + if (WARN_ON_ONCE(page_zone(pfn_to_page(start_pfn)) != zone || + page_zone(pfn_to_page(end_pfn - 1)) != zone)) { + ret = -EINVAL; + reason = "multizone range"; + goto failed_removal; + } + + /* + * Disable pcplists so that page isolation cannot race with freeing + * in a way that pages from isolated pageblock are left on pcplists. + */ + zone_pcp_disable(zone); + lru_cache_disable(); + + /* set above range as isolated */ + ret = start_isolate_page_range(start_pfn, end_pfn, + MIGRATE_MOVABLE, + MEMORY_OFFLINE | REPORT_FAILURE, + GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL); + if (ret) { + reason = "failure to isolate range"; + goto failed_removal_pcplists_disabled; + } + + arg.start_pfn = start_pfn; + arg.nr_pages = nr_pages; + node_states_check_changes_offline(nr_pages, zone, &arg); + + ret = memory_notify(MEM_GOING_OFFLINE, &arg); + ret = notifier_to_errno(ret); + if (ret) { + reason = "notifier failure"; + goto failed_removal_isolated; + } + + do { + pfn = start_pfn; + do { + if (signal_pending(current)) { + ret = -EINTR; + reason = "signal backoff"; + goto failed_removal_isolated; + } + + cond_resched(); + + ret = scan_movable_pages(pfn, end_pfn, &pfn); + if (!ret) { + /* + * TODO: fatal migration failures should bail + * out + */ + do_migrate_range(pfn, end_pfn); + } + } while (!ret); + + if (ret != -ENOENT) { + reason = "unmovable page"; + goto failed_removal_isolated; + } + + /* + * Dissolve free 
hugepages in the memory block before doing + * offlining actually in order to make hugetlbfs's object + * counting consistent. + */ + ret = dissolve_free_huge_pages(start_pfn, end_pfn); + if (ret) { + reason = "failure to dissolve huge pages"; + goto failed_removal_isolated; + } + + ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE); + + } while (ret); + + /* Mark all sections offline and remove free pages from the buddy. */ + __offline_isolated_pages(start_pfn, end_pfn); + pr_debug("Offlined Pages %ld\n", nr_pages); + + /* + * The memory sections are marked offline, and the pageblock flags + * effectively stale; nobody should be touching them. Fixup the number + * of isolated pageblocks, memory onlining will properly revert this. + */ + spin_lock_irqsave(&zone->lock, flags); + zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; + spin_unlock_irqrestore(&zone->lock, flags); + + lru_cache_enable(); + zone_pcp_enable(zone); + + /* removal success */ + adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); + adjust_present_page_count(pfn_to_page(start_pfn), group, -nr_pages); + + /* reinitialise watermarks and update pcp limits */ + init_per_zone_wmark_min(); + + if (!populated_zone(zone)) { + zone_pcp_reset(zone); + build_all_zonelists(NULL); + } + + node_states_clear_node(node, &arg); + if (arg.status_change_nid >= 0) { + kcompactd_stop(node); + kswapd_stop(node); + } + + writeback_set_ratelimit(); + + memory_notify(MEM_OFFLINE, &arg); + remove_pfn_range_from_zone(zone, start_pfn, nr_pages); + return 0; + +failed_removal_isolated: + /* pushback to free area */ + undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + memory_notify(MEM_CANCEL_OFFLINE, &arg); +failed_removal_pcplists_disabled: + lru_cache_enable(); + zone_pcp_enable(zone); +failed_removal: + pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, + 
reason); + return ret; +} + +static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) +{ + int *nid = arg; + + *nid = mem->nid; + if (unlikely(mem->state != MEM_OFFLINE)) { + phys_addr_t beginpa, endpa; + + beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); + endpa = beginpa + memory_block_size_bytes() - 1; + pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", + &beginpa, &endpa); + + return -EBUSY; + } + return 0; +} + +static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg) +{ + /* + * If not set, continue with the next block. + */ + return mem->nr_vmemmap_pages; +} + +static int check_cpu_on_node(int nid) +{ + int cpu; + + for_each_present_cpu(cpu) { + if (cpu_to_node(cpu) == nid) + /* + * the cpu on this node isn't removed, and we can't + * offline this node. + */ + return -EBUSY; + } + + return 0; +} + +static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg) +{ + int nid = *(int *)arg; + + /* + * If a memory block belongs to multiple nodes, the stored nid is not + * reliable. However, such blocks are always online (e.g., cannot get + * offlined) and, therefore, are still spanned by the node. + */ + return mem->nid == nid ? -EEXIST : 0; +} + +/** + * try_offline_node + * @nid: the node ID + * + * Offline a node if all memory sections and cpus of the node are removed. + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call. + */ +void try_offline_node(int nid) +{ + int rc; + + /* + * If the node still spans pages (especially ZONE_DEVICE), don't + * offline it. A node spans memory after move_pfn_range_to_zone(), + * e.g., after the memory block was onlined. + */ + if (node_spanned_pages(nid)) + return; + + /* + * Especially offline memory blocks might not be spanned by the + * node. They will get spanned by the node once they get onlined. 
+ * However, they link to the node in sysfs and can get onlined later. + */ + rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb); + if (rc) + return; + + if (check_cpu_on_node(nid)) + return; + + /* + * all memory/cpu of this node are removed, we can offline this + * node now. + */ + node_set_offline(nid); + unregister_one_node(nid); +} +EXPORT_SYMBOL(try_offline_node); + +static int __ref try_remove_memory(u64 start, u64 size) +{ + struct vmem_altmap mhp_altmap = {}; + struct vmem_altmap *altmap = NULL; + unsigned long nr_vmemmap_pages; + int rc = 0, nid = NUMA_NO_NODE; + + BUG_ON(check_hotplug_memory_range(start, size)); + + /* + * All memory blocks must be offlined before removing memory. Check + * whether all memory blocks in question are offline and return error + * if this is not the case. + * + * While at it, determine the nid. Note that if we'd have mixed nodes, + * we'd only try to offline the last determined one -- which is good + * enough for the cases we care about. + */ + rc = walk_memory_blocks(start, size, &nid, check_memblock_offlined_cb); + if (rc) + return rc; + + /* + * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in + * the same granularity it was added - a single memory block. + */ + if (mhp_memmap_on_memory()) { + nr_vmemmap_pages = walk_memory_blocks(start, size, NULL, + get_nr_vmemmap_pages_cb); + if (nr_vmemmap_pages) { + if (size != memory_block_size_bytes()) { + pr_warn("Refuse to remove %#llx - %#llx," + "wrong granularity\n", + start, start + size); + return -EINVAL; + } + + /* + * Let remove_pmd_table->free_hugepage_table do the + * right thing if we used vmem_altmap when hot-adding + * the range. + */ + mhp_altmap.alloc = nr_vmemmap_pages; + altmap = &mhp_altmap; + } + } + + /* remove memmap entry */ + firmware_map_remove(start, start + size, "System RAM"); + + /* + * Memory block device removal under the device_hotplug_lock is + * a barrier against racing online attempts. 
+ */ + remove_memory_block_devices(start, size); + + mem_hotplug_begin(); + + arch_remove_memory(start, size, altmap); + + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { + memblock_phys_free(start, size); + memblock_remove(start, size); + } + + release_mem_region_adjustable(start, size); + + if (nid != NUMA_NO_NODE) + try_offline_node(nid); + + mem_hotplug_done(); + return 0; +} + +/** + * __remove_memory - Remove memory if every memory block is offline + * @start: physical address of the region to remove + * @size: size of the region to remove + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call, as required by + * try_offline_node(). + */ +void __remove_memory(u64 start, u64 size) +{ + + /* + * trigger BUG() if some memory is not offlined prior to calling this + * function + */ + if (try_remove_memory(start, size)) + BUG(); +} + +/* + * Remove memory if every memory block is offline, otherwise return -EBUSY is + * some memory is not offline + */ +int remove_memory(u64 start, u64 size) +{ + int rc; + + lock_device_hotplug(); + rc = try_remove_memory(start, size); + unlock_device_hotplug(); + + return rc; +} +EXPORT_SYMBOL_GPL(remove_memory); + +static int try_offline_memory_block(struct memory_block *mem, void *arg) +{ + uint8_t online_type = MMOP_ONLINE_KERNEL; + uint8_t **online_types = arg; + struct page *page; + int rc; + + /* + * Sense the online_type via the zone of the memory block. Offlining + * with multiple zones within one memory block will be rejected + * by offlining code ... so we don't care about that. + */ + page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr)); + if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE) + online_type = MMOP_ONLINE_MOVABLE; + + rc = device_offline(&mem->dev); + /* + * Default is MMOP_OFFLINE - change it only if offlining succeeded, + * so try_reonline_memory_block() can do the right thing. 
+ */ + if (!rc) + **online_types = online_type; + + (*online_types)++; + /* Ignore if already offline. */ + return rc < 0 ? rc : 0; +} + +static int try_reonline_memory_block(struct memory_block *mem, void *arg) +{ + uint8_t **online_types = arg; + int rc; + + if (**online_types != MMOP_OFFLINE) { + mem->online_type = **online_types; + rc = device_online(&mem->dev); + if (rc < 0) + pr_warn("%s: Failed to re-online memory: %d", + __func__, rc); + } + + /* Continue processing all remaining memory blocks. */ + (*online_types)++; + return 0; +} + +/* + * Try to offline and remove memory. Might take a long time to finish in case + * memory is still in use. Primarily useful for memory devices that logically + * unplugged all memory (so it's no longer in use) and want to offline + remove + * that memory. + */ +int offline_and_remove_memory(u64 start, u64 size) +{ + const unsigned long mb_count = size / memory_block_size_bytes(); + uint8_t *online_types, *tmp; + int rc; + + if (!IS_ALIGNED(start, memory_block_size_bytes()) || + !IS_ALIGNED(size, memory_block_size_bytes()) || !size) + return -EINVAL; + + /* + * We'll remember the old online type of each memory block, so we can + * try to revert whatever we did when offlining one memory block fails + * after offlining some others succeeded. + */ + online_types = kmalloc_array(mb_count, sizeof(*online_types), + GFP_KERNEL); + if (!online_types) + return -ENOMEM; + /* + * Initialize all states to MMOP_OFFLINE, so when we abort processing in + * try_offline_memory_block(), we'll skip all unprocessed blocks in + * try_reonline_memory_block(). + */ + memset(online_types, MMOP_OFFLINE, mb_count); + + lock_device_hotplug(); + + tmp = online_types; + rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block); + + /* + * In case we succeeded to offline all memory, remove it. + * This cannot fail as it cannot get onlined in the meantime. 
+ */ + if (!rc) { + rc = try_remove_memory(start, size); + if (rc) + pr_err("%s: Failed to remove memory: %d", __func__, rc); + } + + /* + * Rollback what we did. While memory onlining might theoretically fail + * (nacked by a notifier), it barely ever happens. + */ + if (rc) { + tmp = online_types; + walk_memory_blocks(start, size, &tmp, + try_reonline_memory_block); + } + unlock_device_hotplug(); + + kfree(online_types); + return rc; +} +EXPORT_SYMBOL_GPL(offline_and_remove_memory); +#endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c new file mode 100644 index 000000000..84e11c2ca --- /dev/null +++ b/mm/mempolicy.c @@ -0,0 +1,3163 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Simple NUMA memory policy for the Linux kernel. + * + * Copyright 2003,2004 Andi Kleen, SuSE Labs. + * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. + * + * NUMA policy allows the user to give hints in which node(s) memory should + * be allocated. + * + * Support four policies per VMA and per process: + * + * The VMA policy has priority over the process policy for a page fault. + * + * interleave Allocate memory interleaved over a set of nodes, + * with normal fallback if it fails. + * For VMA based allocations this interleaves based on the + * offset into the backing object or offset into the mapping + * for anonymous memory. For process policy an process counter + * is used. + * + * bind Only allocate memory on a specific set of nodes, + * no fallback. + * FIXME: memory is allocated starting with the first node + * to the last. It would be better if bind would truly restrict + * the allocation to memory nodes instead + * + * preferred Try a specific node first before normal fallback. + * As a special case NUMA_NO_NODE here means do the allocation + * on the local CPU. This is normally identical to default, + * but useful to set in a VMA when you have a non default + * process policy. 
+ * + * preferred many Try a set of nodes first before normal fallback. This is + * similar to preferred without the special case. + * + * default Allocate on the local node first, or when on a VMA + * use the process policy. This is what Linux always did + * in a NUMA aware kernel and still does by, ahem, default. + * + * The process policy is applied for most non interrupt memory allocations + * in that process' context. Interrupts ignore the policies and always + * try to allocate on the local CPU. The VMA policy is only applied for memory + * allocations for a VMA in the VM. + * + * Currently there are a few corner cases in swapping where the policy + * is not applied, but the majority should be handled. When process policy + * is used it is not remembered over swap outs/swap ins. + * + * Only the highest zone in the zone hierarchy gets policied. Allocations + * requesting a lower zone just use default policy. This implies that + * on systems with highmem kernel lowmem allocation don't get policied. + * Same with GFP_DMA allocations. + * + * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between + * all users and remembered even when nobody has memory mapped. + */ + +/* Notebook: + fix mmap readahead to honour policy and enable policy for any page cache + object + statistics for bigpages + global policy for page cache? currently it uses process policy. Requires + first item above. + handle mremap for shared memory (currently ignored for the policy) + grows down? + make bind policy root only? It can trigger oom much faster and the + kernel is not always grateful with that. 
+*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" + +/* Internal flags */ +#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ +#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ + +static struct kmem_cache *policy_cache; +static struct kmem_cache *sn_cache; + +/* Highest zone. An specific allocation for a zone below that is not + policied. */ +enum zone_type policy_zone = 0; + +/* + * run-time system-wide default policy => local allocation + */ +static struct mempolicy default_policy = { + .refcnt = ATOMIC_INIT(1), /* never free it */ + .mode = MPOL_LOCAL, +}; + +static struct mempolicy preferred_node_policy[MAX_NUMNODES]; + +/** + * numa_map_to_online_node - Find closest online node + * @node: Node id to start the search + * + * Lookup the next closest node by distance if @nid is not online. 
+ * + * Return: this @node if it is online, otherwise the closest node by distance + */ +int numa_map_to_online_node(int node) +{ + int min_dist = INT_MAX, dist, n, min_node; + + if (node == NUMA_NO_NODE || node_online(node)) + return node; + + min_node = node; + for_each_online_node(n) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; + } + } + + return min_node; +} +EXPORT_SYMBOL_GPL(numa_map_to_online_node); + +struct mempolicy *get_task_policy(struct task_struct *p) +{ + struct mempolicy *pol = p->mempolicy; + int node; + + if (pol) + return pol; + + node = numa_node_id(); + if (node != NUMA_NO_NODE) { + pol = &preferred_node_policy[node]; + /* preferred_node_policy is not initialised early in boot */ + if (pol->mode) + return pol; + } + + return &default_policy; +} + +static const struct mempolicy_operations { + int (*create)(struct mempolicy *pol, const nodemask_t *nodes); + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); +} mpol_ops[MPOL_MAX]; + +static inline int mpol_store_user_nodemask(const struct mempolicy *pol) +{ + return pol->flags & MPOL_MODE_FLAGS; +} + +static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, + const nodemask_t *rel) +{ + nodemask_t tmp; + nodes_fold(tmp, *orig, nodes_weight(*rel)); + nodes_onto(*ret, tmp, *rel); +} + +static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (nodes_empty(*nodes)) + return -EINVAL; + pol->nodes = *nodes; + return 0; +} + +static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) +{ + if (nodes_empty(*nodes)) + return -EINVAL; + + nodes_clear(pol->nodes); + node_set(first_node(*nodes), pol->nodes); + return 0; +} + +/* + * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if + * any, for the new policy. mpol_new() has already validated the nodes + * parameter with respect to the policy mode and flags. 
+ * + * Must be called holding task's alloc_lock to protect task's mems_allowed + * and mempolicy. May also be called holding the mmap_lock for write. + */ +static int mpol_set_nodemask(struct mempolicy *pol, + const nodemask_t *nodes, struct nodemask_scratch *nsc) +{ + int ret; + + /* + * Default (pol==NULL) resp. local memory policies are not a + * subject of any remapping. They also do not need any special + * constructor. + */ + if (!pol || pol->mode == MPOL_LOCAL) + return 0; + + /* Check N_MEMORY */ + nodes_and(nsc->mask1, + cpuset_current_mems_allowed, node_states[N_MEMORY]); + + VM_BUG_ON(!nodes); + + if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); + else + nodes_and(nsc->mask2, *nodes, nsc->mask1); + + if (mpol_store_user_nodemask(pol)) + pol->w.user_nodemask = *nodes; + else + pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; + + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); + return ret; +} + +/* + * This function just creates a new policy, does some check and simple + * initialization. You must invoke mpol_set_nodemask() to set nodes. + */ +static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, + nodemask_t *nodes) +{ + struct mempolicy *policy; + + pr_debug("setting mode %d flags %d nodes[0] %lx\n", + mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); + + if (mode == MPOL_DEFAULT) { + if (nodes && !nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); + return NULL; + } + VM_BUG_ON(!nodes); + + /* + * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or + * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). + * All other modes require a valid pointer to a non-empty nodemask. 
+ */ + if (mode == MPOL_PREFERRED) { + if (nodes_empty(*nodes)) { + if (((flags & MPOL_F_STATIC_NODES) || + (flags & MPOL_F_RELATIVE_NODES))) + return ERR_PTR(-EINVAL); + + mode = MPOL_LOCAL; + } + } else if (mode == MPOL_LOCAL) { + if (!nodes_empty(*nodes) || + (flags & MPOL_F_STATIC_NODES) || + (flags & MPOL_F_RELATIVE_NODES)) + return ERR_PTR(-EINVAL); + } else if (nodes_empty(*nodes)) + return ERR_PTR(-EINVAL); + policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); + if (!policy) + return ERR_PTR(-ENOMEM); + atomic_set(&policy->refcnt, 1); + policy->mode = mode; + policy->flags = flags; + policy->home_node = NUMA_NO_NODE; + + return policy; +} + +/* Slow path of a mpol destructor. */ +void __mpol_put(struct mempolicy *p) +{ + if (!atomic_dec_and_test(&p->refcnt)) + return; + kmem_cache_free(policy_cache, p); +} + +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) +{ +} + +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +{ + nodemask_t tmp; + + if (pol->flags & MPOL_F_STATIC_NODES) + nodes_and(tmp, pol->w.user_nodemask, *nodes); + else if (pol->flags & MPOL_F_RELATIVE_NODES) + mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); + else { + nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = *nodes; + } + + if (nodes_empty(tmp)) + tmp = *nodes; + + pol->nodes = tmp; +} + +static void mpol_rebind_preferred(struct mempolicy *pol, + const nodemask_t *nodes) +{ + pol->w.cpuset_mems_allowed = *nodes; +} + +/* + * mpol_rebind_policy - Migrate a policy to a different set of nodes + * + * Per-vma policies are protected by mmap_lock. Allocations using per-task + * policies are protected by task->mems_allowed_seq to prevent a premature + * OOM/allocation failure due to parallel nodemask modification. 
+ */ +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) +{ + if (!pol || pol->mode == MPOL_LOCAL) + return; + if (!mpol_store_user_nodemask(pol) && + nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) + return; + + mpol_ops[pol->mode].rebind(pol, newmask); +} + +/* + * Wrapper for mpol_rebind_policy() that just requires task + * pointer, and updates task mempolicy. + * + * Called with task's alloc_lock held. + */ + +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) +{ + mpol_rebind_policy(tsk->mempolicy, new); +} + +/* + * Rebind each vma in mm to new nodemask. + * + * Call holding a reference to mm. Takes mm->mmap_lock during call. + */ + +void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) +{ + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + mmap_write_lock(mm); + for_each_vma(vmi, vma) + mpol_rebind_policy(vma->vm_policy, new); + mmap_write_unlock(mm); +} + +static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { + [MPOL_DEFAULT] = { + .rebind = mpol_rebind_default, + }, + [MPOL_INTERLEAVE] = { + .create = mpol_new_nodemask, + .rebind = mpol_rebind_nodemask, + }, + [MPOL_PREFERRED] = { + .create = mpol_new_preferred, + .rebind = mpol_rebind_preferred, + }, + [MPOL_BIND] = { + .create = mpol_new_nodemask, + .rebind = mpol_rebind_nodemask, + }, + [MPOL_LOCAL] = { + .rebind = mpol_rebind_default, + }, + [MPOL_PREFERRED_MANY] = { + .create = mpol_new_nodemask, + .rebind = mpol_rebind_preferred, + }, +}; + +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, + unsigned long flags); + +struct queue_pages { + struct list_head *pagelist; + unsigned long flags; + nodemask_t *nmask; + unsigned long start; + unsigned long end; + struct vm_area_struct *first; + bool has_unmovable; +}; + +/* + * Check if the page's nid is in qp->nmask. + * + * If MPOL_MF_INVERT is set in qp->flags, check if the nid is + * in the invert of qp->nmask. 
+ */
+static inline bool queue_pages_required(struct page *page,
+					struct queue_pages *qp)
+{
+	int nid = page_to_nid(page);
+	unsigned long flags = qp->flags;
+
+	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);	/* INVERT flips the membership test */
+}
+
+/*
+ * queue_folios_pmd() has two possible return values:
+ * 0 - folios are placed on the right node or queued successfully, or
+ *     special page is met, i.e. zero page, or unmovable page is found
+ *     but continue walking (indicated by queue_pages.has_unmovable).
+ * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
+ *        existing folio was already on a node that does not follow the
+ *        policy.
+ *
+ * Entered with @ptl held; every path leaves through the unlock label, so
+ * @ptl is always released before returning.
+ */
+static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
+				unsigned long end, struct mm_walk *walk)
+	__releases(ptl)
+{
+	int ret = 0;
+	struct folio *folio;
+	struct queue_pages *qp = walk->private;
+	unsigned long flags;
+
+	if (unlikely(is_pmd_migration_entry(*pmd))) {
+		ret = -EIO;
+		goto unlock;
+	}
+	folio = pfn_folio(pmd_pfn(*pmd));
+	if (is_huge_zero_page(&folio->page)) {
+		walk->action = ACTION_CONTINUE;
+		goto unlock;
+	}
+	if (!queue_pages_required(&folio->page, qp))
+		goto unlock;	/* folio is not selected by nmask/INVERT */
+
+	flags = qp->flags;
+	/* go to folio migration */
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+		if (!vma_migratable(walk->vma) ||
+		    migrate_folio_add(folio, qp->pagelist, flags)) {
+			qp->has_unmovable = true;	/* recorded; ret stays 0 so the walk continues */
+			goto unlock;
+		}
+	} else
+		ret = -EIO;	/* selected folio but no MOVE flag was given */
+unlock:
+	spin_unlock(ptl);
+	return ret;
+}
+
+/*
+ * Scan through pages checking if pages follow certain conditions,
+ * and move them to the pagelist if they do.
+ *
+ * queue_folios_pte_range() has three possible return values:
+ * 0 - folios are placed on the right node or queued successfully, or
+ *     special page is met, i.e. zero page, or unmovable page is found
+ *     but continue walking (indicated by queue_pages.has_unmovable).
+ * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
+ *        on a node that does not follow the policy.
+ */
+static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
+			unsigned long end, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	struct folio *folio;
+	struct queue_pages *qp = walk->private;
+	unsigned long flags = qp->flags;
+	pte_t *pte, *mapped_pte;
+	spinlock_t *ptl;
+
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (ptl)
+		return queue_folios_pmd(pmd, ptl, addr, end, walk);	/* callee releases ptl */
+
+	if (pmd_trans_unstable(pmd))
+		return 0;
+
+	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	for (; addr != end; pte++, addr += PAGE_SIZE) {
+		if (!pte_present(*pte))
+			continue;
+		folio = vm_normal_folio(vma, addr, *pte);
+		if (!folio || folio_is_zone_device(folio))
+			continue;
+		/*
+		 * vm_normal_folio() filters out zero pages, but there might
+		 * still be reserved folios to skip, perhaps in a VDSO.
+		 */
+		if (folio_test_reserved(folio))
+			continue;
+		if (!queue_pages_required(&folio->page, qp))
+			continue;
+		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+			/*
+			 * MPOL_MF_STRICT must be specified if we get here.
+			 * Continue walking vmas due to MPOL_MF_MOVE* flags.
+			 */
+			if (!vma_migratable(vma))
+				qp->has_unmovable = true;
+
+			/*
+			 * Do not abort immediately since there may be
+			 * temporary off LRU pages in the range.  We still
+			 * need to migrate the other LRU pages.
+			 */
+			if (migrate_folio_add(folio, qp->pagelist, flags))
+				qp->has_unmovable = true;
+		} else
+			break;	/* selected folio without MOVE flags: stop early, reported as -EIO below */
+	}
+	pte_unmap_unlock(mapped_pte, ptl);
+	cond_resched();
+
+	return addr != end ? -EIO : 0;	/* addr != end only when the loop broke out early */
+}
+
+static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
+			       unsigned long addr, unsigned long end,
+			       struct mm_walk *walk)
+{	/* .hugetlb_entry callback of queue_pages_walk_ops; returns 0 or -EIO */
+	int ret = 0;
+#ifdef CONFIG_HUGETLB_PAGE
+	struct queue_pages *qp = walk->private;
+	unsigned long flags = (qp->flags & MPOL_MF_VALID);	/* only the MPOL_MF_VALID bits are considered here */
+	struct page *page;
+	spinlock_t *ptl;
+	pte_t entry;
+
+	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+	entry = huge_ptep_get(pte);
+	if (!pte_present(entry))
+		goto unlock;
+	page = pte_page(entry);
+	if (!queue_pages_required(page, qp))
+		goto unlock;
+
+	if (flags == MPOL_MF_STRICT) {
+		/*
+		 * STRICT alone means only detecting misplaced page and no
+		 * need to further check other vma.
+		 */
+		ret = -EIO;
+		goto unlock;
+	}
+
+	if (!vma_migratable(walk->vma)) {
+		/*
+		 * Must be STRICT with MOVE*, otherwise .test_walk() would
+		 * have stopped walking the current vma.
+		 * A misplaced page was detected, but still allow migrating
+		 * the pages which have already been queued.
+		 */
+		qp->has_unmovable = true;
+		goto unlock;
+	}
+
+	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
+	if (flags & (MPOL_MF_MOVE_ALL) ||
+	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
+	     !hugetlb_pmd_shared(pte))) {
+		if (isolate_hugetlb(page, qp->pagelist) &&
+			(flags & MPOL_MF_STRICT))
+			/*
+			 * Failed to isolate page but allow migrating pages
+			 * which have been queued.
+			 */
+			qp->has_unmovable = true;
+	}
+unlock:
+	spin_unlock(ptl);
+#else
+	BUG();	/* this callback is not expected to run without CONFIG_HUGETLB_PAGE */
+#endif
+	return ret;
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * This is used to mark a range of virtual addresses to be inaccessible.
+ * These are later cleared by a NUMA hinting fault. Depending on these
+ * faults, pages may be migrated for better NUMA placement.
+ *
+ * This is assuming that NUMA faults are handled using PROT_NONE. If
+ * an architecture makes a different choice, it will need further
+ * changes to the core.
+ */ +unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + struct mmu_gather tlb; + int nr_updated; + + tlb_gather_mmu(&tlb, vma->vm_mm); + + nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE, + MM_CP_PROT_NUMA); + if (nr_updated) + count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); + + tlb_finish_mmu(&tlb); + + return nr_updated; +} +#else +static unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + return 0; +} +#endif /* CONFIG_NUMA_BALANCING */ + +static int queue_pages_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *next, *vma = walk->vma; + struct queue_pages *qp = walk->private; + unsigned long endvma = vma->vm_end; + unsigned long flags = qp->flags; + + /* range check first */ + VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); + + if (!qp->first) { + qp->first = vma; + if (!(flags & MPOL_MF_DISCONTIG_OK) && + (qp->start < vma->vm_start)) + /* hole at head side of range */ + return -EFAULT; + } + next = find_vma(vma->vm_mm, vma->vm_end); + if (!(flags & MPOL_MF_DISCONTIG_OK) && + ((vma->vm_end < qp->end) && + (!next || vma->vm_end < next->vm_start))) + /* hole at middle or tail of range */ + return -EFAULT; + + /* + * Need check MPOL_MF_STRICT to return -EIO if possible + * regardless of vma_migratable + */ + if (!vma_migratable(vma) && + !(flags & MPOL_MF_STRICT)) + return 1; + + if (endvma > end) + endvma = end; + + if (flags & MPOL_MF_LAZY) { + /* Similar to task_numa_work, skip inaccessible VMAs */ + if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) && + !(vma->vm_flags & VM_MIXEDMAP)) + change_prot_numa(vma, start, endvma); + return 1; + } + + /* queue pages from current vma */ + if (flags & MPOL_MF_VALID) + return 0; + return 1; +} + +static const struct mm_walk_ops queue_pages_walk_ops = { + .hugetlb_entry = queue_pages_hugetlb, + .pmd_entry = queue_folios_pte_range, + 
.test_walk = queue_pages_test_walk, +}; + +/* + * Walk through page tables and collect pages to be migrated. + * + * If pages found in a given range are on a set of nodes (determined by + * @nodes and @flags,) it's isolated and queued to the pagelist which is + * passed via @private. + * + * queue_pages_range() has three possible return values: + * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were + * specified. + * 0 - queue pages successfully or no misplaced page. + * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or + * memory range specified by nodemask and maxnode points outside + * your accessible address space (-EFAULT) + */ +static int +queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, + nodemask_t *nodes, unsigned long flags, + struct list_head *pagelist) +{ + int err; + struct queue_pages qp = { + .pagelist = pagelist, + .flags = flags, + .nmask = nodes, + .start = start, + .end = end, + .first = NULL, + .has_unmovable = false, + }; + + err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); + + if (qp.has_unmovable) + err = 1; + if (!qp.first) + /* whole range in hole */ + err = -EFAULT; + + return err; +} + +/* + * Apply policy to a single VMA + * This must be called with the mmap_lock held for writing. + */ +static int vma_replace_policy(struct vm_area_struct *vma, + struct mempolicy *pol) +{ + int err; + struct mempolicy *old; + struct mempolicy *new; + + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, + vma->vm_ops ? 
vma->vm_ops->set_policy : NULL); + + new = mpol_dup(pol); + if (IS_ERR(new)) + return PTR_ERR(new); + + if (vma->vm_ops && vma->vm_ops->set_policy) { + err = vma->vm_ops->set_policy(vma, new); + if (err) + goto err_out; + } + + old = vma->vm_policy; + vma->vm_policy = new; /* protected by mmap_lock */ + mpol_put(old); + + return 0; + err_out: + mpol_put(new); + return err; +} + +/* Split or merge the VMA (if required) and apply the new policy */ +static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, struct mempolicy *new_pol) +{ + struct vm_area_struct *merged; + unsigned long vmstart, vmend; + pgoff_t pgoff; + int err; + + vmend = min(end, vma->vm_end); + if (start > vma->vm_start) { + *prev = vma; + vmstart = start; + } else { + vmstart = vma->vm_start; + } + + if (mpol_equal(vma_policy(vma), new_pol)) { + *prev = vma; + return 0; + } + + pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); + merged = vma_merge(vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, new_pol, + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (merged) { + *prev = merged; + /* vma_merge() invalidated the mas */ + mas_pause(&vmi->mas); + return vma_replace_policy(merged, new_pol); + } + + if (vma->vm_start != vmstart) { + err = split_vma(vma->vm_mm, vma, vmstart, 1); + if (err) + return err; + /* split_vma() invalidated the mas */ + mas_pause(&vmi->mas); + } + + if (vma->vm_end != vmend) { + err = split_vma(vma->vm_mm, vma, vmend, 0); + if (err) + return err; + /* split_vma() invalidated the mas */ + mas_pause(&vmi->mas); + } + + *prev = vma; + return vma_replace_policy(vma, new_pol); +} + +/* Set the process memory policy */ +static long do_set_mempolicy(unsigned short mode, unsigned short flags, + nodemask_t *nodes) +{ + struct mempolicy *new, *old; + NODEMASK_SCRATCH(scratch); + int ret; + + if (!scratch) + return -ENOMEM; + + new = 
mpol_new(mode, flags, nodes); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto out; + } + + task_lock(current); + ret = mpol_set_nodemask(new, nodes, scratch); + if (ret) { + task_unlock(current); + mpol_put(new); + goto out; + } + + old = current->mempolicy; + current->mempolicy = new; + if (new && new->mode == MPOL_INTERLEAVE) + current->il_prev = MAX_NUMNODES-1; + task_unlock(current); + mpol_put(old); + ret = 0; +out: + NODEMASK_SCRATCH_FREE(scratch); + return ret; +} + +/* + * Return nodemask for policy for get_mempolicy() query + * + * Called with task's alloc_lock held + */ +static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) +{ + nodes_clear(*nodes); + if (p == &default_policy) + return; + + switch (p->mode) { + case MPOL_BIND: + case MPOL_INTERLEAVE: + case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: + *nodes = p->nodes; + break; + case MPOL_LOCAL: + /* return empty node mask for local allocation */ + break; + default: + BUG(); + } +} + +static int lookup_node(struct mm_struct *mm, unsigned long addr) +{ + struct page *p = NULL; + int ret; + + ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p); + if (ret > 0) { + ret = page_to_nid(p); + put_page(p); + } + return ret; +} + +/* Retrieve NUMA policy */ +static long do_get_mempolicy(int *policy, nodemask_t *nmask, + unsigned long addr, unsigned long flags) +{ + int err; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; + + if (flags & + ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) + return -EINVAL; + + if (flags & MPOL_F_MEMS_ALLOWED) { + if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) + return -EINVAL; + *policy = 0; /* just so it's initialized */ + task_lock(current); + *nmask = cpuset_current_mems_allowed; + task_unlock(current); + return 0; + } + + if (flags & MPOL_F_ADDR) { + /* + * Do NOT fall back to task policy if the + * vma/shared policy at addr is NULL. 
We + * want to return MPOL_DEFAULT in this case. + */ + mmap_read_lock(mm); + vma = vma_lookup(mm, addr); + if (!vma) { + mmap_read_unlock(mm); + return -EFAULT; + } + if (vma->vm_ops && vma->vm_ops->get_policy) + pol = vma->vm_ops->get_policy(vma, addr); + else + pol = vma->vm_policy; + } else if (addr) + return -EINVAL; + + if (!pol) + pol = &default_policy; /* indicates default behavior */ + + if (flags & MPOL_F_NODE) { + if (flags & MPOL_F_ADDR) { + /* + * Take a refcount on the mpol, because we are about to + * drop the mmap_lock, after which only "pol" remains + * valid, "vma" is stale. + */ + pol_refcount = pol; + vma = NULL; + mpol_get(pol); + mmap_read_unlock(mm); + err = lookup_node(mm, addr); + if (err < 0) + goto out; + *policy = err; + } else if (pol == current->mempolicy && + pol->mode == MPOL_INTERLEAVE) { + *policy = next_node_in(current->il_prev, pol->nodes); + } else { + err = -EINVAL; + goto out; + } + } else { + *policy = pol == &default_policy ? MPOL_DEFAULT : + pol->mode; + /* + * Internal mempolicy flags must be masked off before exposing + * the policy to userspace. + */ + *policy |= (pol->flags & MPOL_MODE_FLAGS); + } + + err = 0; + if (nmask) { + if (mpol_store_user_nodemask(pol)) { + *nmask = pol->w.user_nodemask; + } else { + task_lock(current); + get_policy_nodemask(pol, nmask); + task_unlock(current); + } + } + + out: + mpol_cond_put(pol); + if (vma) + mmap_read_unlock(mm); + if (pol_refcount) + mpol_put(pol_refcount); + return err; +} + +#ifdef CONFIG_MIGRATION +static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, + unsigned long flags) +{ + /* + * We try to migrate only unshared folios. If it is shared it + * is likely not worth migrating. + * + * To check if the folio is shared, ideally we want to make sure + * every page is mapped to the same process. Doing that is very + * expensive, so check the estimated mapcount of the folio instead. 
+ */ + if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { + if (!folio_isolate_lru(folio)) { + list_add_tail(&folio->lru, foliolist); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); + } else if (flags & MPOL_MF_STRICT) { + /* + * Non-movable folio may reach here. And, there may be + * temporary off LRU folios or non-LRU movable folios. + * Treat them as unmovable folios since they can't be + * isolated, so they can't be moved at the moment. It + * should return -EIO for this case too. + */ + return -EIO; + } + } + + return 0; +} + +/* + * Migrate pages from one node to a target node. + * Returns error or the number of pages not migrated. + */ +static int migrate_to_node(struct mm_struct *mm, int source, int dest, + int flags) +{ + nodemask_t nmask; + struct vm_area_struct *vma; + LIST_HEAD(pagelist); + int err = 0; + struct migration_target_control mtc = { + .nid = dest, + .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, + }; + + nodes_clear(nmask); + node_set(source, nmask); + + /* + * This does not "check" the range but isolates all pages that + * need migration. Between passing in the full user address + * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. + */ + vma = find_vma(mm, 0); + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + + if (!list_empty(&pagelist)) { + err = migrate_pages(&pagelist, alloc_migration_target, NULL, + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); + if (err) + putback_movable_pages(&pagelist); + } + + return err; +} + +/* + * Move pages between the two nodesets so as to preserve the physical + * layout as much as possible. + * + * Returns the number of page that could not be moved. 
+ */
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+		     const nodemask_t *to, int flags)
+{
+	int busy = 0;	/* sum of the positive results of migrate_to_node() */
+	int err = 0;
+	nodemask_t tmp;	/* source nodes not handled yet */
+
+	lru_cache_disable();
+
+	mmap_read_lock(mm);
+
+	/*
+	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
+	 * bit in 'tmp', and return that <source, dest> pair for migration.
+	 * The pair of nodemasks 'to' and 'from' define the map.
+	 *
+	 * If no pair of bits is found that way, fallback to picking some
+	 * pair of 'source' and 'dest' bits that are not the same.  If the
+	 * 'source' and 'dest' bits are the same, this represents a node
+	 * that will be migrating to itself, so no pages need move.
+	 *
+	 * If no bits are left in 'tmp', or if all remaining bits left
+	 * in 'tmp' correspond to the same bit in 'to', return false
+	 * (nothing left to migrate).
+	 *
+	 * This lets us pick a pair of nodes to migrate between, such that
+	 * if possible the dest node is not already occupied by some other
+	 * source node, minimizing the risk of overloading the memory on a
+	 * node that would happen if we migrated incoming memory to a node
+	 * before migrating outgoing memory from that same node.
+	 *
+	 * A single scan of tmp is sufficient.  As we go, we remember the
+	 * most recent <s, d> pair that moved (s != d).  If we find a pair
+	 * that not only moved, but what's better, moved to an empty slot
+	 * (d is not set in tmp), then we break out then, with that pair.
+	 * Otherwise when we finish scanning tmp, we at least have the
+	 * most recent <s, d> pair that moved.  If we get all the way through
+	 * the scan of tmp without finding any node that moved, much less
+	 * moved to an empty node, then there is nothing left worth migrating.
+	 */
+
+	tmp = *from;
+	while (!nodes_empty(tmp)) {
+		int s, d;
+		int source = NUMA_NO_NODE;	/* stays NUMA_NO_NODE if no node moves */
+		int dest = 0;
+
+		for_each_node_mask(s, tmp) {
+
+			/*
+			 * do_migrate_pages() tries to maintain the relative
+			 * node relationship of the pages established between
+			 * threads and memory areas.
+			 *
+			 * However if the number of source nodes is not equal to
+			 * the number of destination nodes we can not preserve
+			 * this node relative relationship.  In that case, skip
+			 * copying memory from a node that is in the destination
+			 * mask.
+			 *
+			 * Example: [2,3,4] -> [3,4,5] moves everything.
+			 *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
+			 */
+
+			if ((nodes_weight(*from) != nodes_weight(*to)) &&
+						(node_isset(s, *to)))
+				continue;
+
+			d = node_remap(s, *from, *to);
+			if (s == d)
+				continue;
+
+			source = s;	/* Node moved. Memorize */
+			dest = d;
+
+			/* dest not in remaining from nodes? */
+			if (!node_isset(dest, tmp))
+				break;
+		}
+		if (source == NUMA_NO_NODE)
+			break;	/* no remaining node maps to a different node */
+
+		node_clear(source, tmp);
+		err = migrate_to_node(mm, source, dest, flags);
+		if (err > 0)
+			busy += err;
+		if (err < 0)
+			break;
+	}
+	mmap_read_unlock(mm);
+
+	lru_cache_enable();
+	if (err < 0)
+		return err;
+	return busy;
+
+}
+
+/*
+ * Allocate a new page for page migration based on vma policy.
+ * Start by assuming the page is mapped by the same vma as contains @start.
+ * Search forward from there, if not.  N.B., this assumes that the
+ * list of pages handed to migrate_pages()--which is how we get here--
+ * is in virtual address order.
+ */
+static struct page *new_page(struct page *page, unsigned long start)
+{
+	struct folio *dst, *src = page_folio(page);
+	struct vm_area_struct *vma;
+	unsigned long address;
+	VMA_ITERATOR(vmi, current->mm, start);
+	gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
+
+	for_each_vma(vmi, vma) {
+		address = page_address_in_vma(page, vma);
+		if (address != -EFAULT)
+			break;	/* found a vma that maps the page */
+	}
+
+	if (folio_test_hugetlb(src))
+		return alloc_huge_page_vma(page_hstate(&src->page),
+				vma, address);
+
+	if (folio_test_large(src))
+		gfp = GFP_TRANSHUGE;
+
+	/*
+	 * if !vma, vma_alloc_folio() will use task or system default policy
+	 */
+	dst = vma_alloc_folio(gfp, folio_order(src), vma, address,
+			folio_test_large(src));
+	return &dst->page;
+}
+#else
+
+static int migrate_folio_add(struct folio *folio, struct list_head *foliolist,
+				unsigned long flags)
+{
+	return -EIO;	/* stub used when CONFIG_MIGRATION is not set */
+}
+
+int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
+		     const nodemask_t *to, int flags)
+{
+	return -ENOSYS;	/* stub used when CONFIG_MIGRATION is not set */
+}
+
+static struct page *new_page(struct page *page, unsigned long start)
+{
+	return NULL;	/* stub used when CONFIG_MIGRATION is not set */
+}
+#endif
+
+static long do_mbind(unsigned long start, unsigned long len,
+		     unsigned short mode, unsigned short mode_flags,
+		     nodemask_t *nmask, unsigned long flags)
+{	/* Apply a policy to [start, start + len) of current->mm; returns 0 or a negative errno. */
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma, *prev;
+	struct vma_iterator vmi;
+	struct mempolicy *new;
+	unsigned long end;
+	int err;
+	int ret;
+	LIST_HEAD(pagelist);	/* pages queued for migration by queue_pages_range() */
+
+	if (flags & ~(unsigned long)MPOL_MF_VALID)
+		return -EINVAL;
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	if (start & ~PAGE_MASK)
+		return -EINVAL;
+
+	if (mode == MPOL_DEFAULT)
+		flags &= ~MPOL_MF_STRICT;
+
+	len = PAGE_ALIGN(len);
+	end = start + len;
+
+	if (end < start)
+		return -EINVAL;	/* start + len wrapped around */
+	if (end == start)
+		return 0;
+
+	new = mpol_new(mode, mode_flags, nmask);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+
+	if (flags & MPOL_MF_LAZY)
+		new->flags |= MPOL_F_MOF;	/* NOTE(review): dereferences new before the !new test below - confirm MPOL_MF_LAZY cannot pass the MPOL_MF_VALID check */
+
+	/*
+	 * If we are using the default policy then operation
+	 * on discontinuous address spaces is okay after all
+	 */
+	if (!new)
+		flags |= MPOL_MF_DISCONTIG_OK;
+
+	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
+		 start, start + len, mode, mode_flags,
+		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
+
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+
+		lru_cache_disable();	/* paired with lru_cache_enable() at mpol_out */
+	}
+	{
+		NODEMASK_SCRATCH(scratch);
+		if (scratch) {
+			mmap_write_lock(mm);
+			err = mpol_set_nodemask(new, nmask, scratch);
+			if (err)
+				mmap_write_unlock(mm);	/* on success the lock stays held until the end */
+		} else
+			err = -ENOMEM;
+		NODEMASK_SCRATCH_FREE(scratch);
+	}
+	if (err)
+		goto mpol_out;
+
+	ret = queue_pages_range(mm, start, end, nmask,
+			  flags | MPOL_MF_INVERT, &pagelist);
+
+	if (ret < 0) {
+		err = ret;
+		goto up_out;
+	}
+
+	vma_iter_init(&vmi, mm, start);
+	prev = vma_prev(&vmi);
+	for_each_vma_range(vmi, vma, end) {
+		err = mbind_range(&vmi, vma, &prev, start, end, new);
+		if (err)
+			break;
+	}
+
+	if (!err) {
+		int nr_failed = 0;
+
+		if (!list_empty(&pagelist)) {
+			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
+			nr_failed = migrate_pages(&pagelist, new_page, NULL,
+				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
+			if (nr_failed)
+				putback_movable_pages(&pagelist);
+		}
+
+		if (((ret > 0) || nr_failed) && (flags & MPOL_MF_STRICT))
+			err = -EIO;	/* ret > 0: queue_pages_range() saw an unmovable page */
+	} else {
+up_out:
+		if (!list_empty(&pagelist))
+			putback_movable_pages(&pagelist);
+	}
+
+	mmap_write_unlock(mm);
+mpol_out:
+	mpol_put(new);
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_cache_enable();
+	return err;
+}
+
+/*
+ * User space interface with variable sized bitmaps for nodelists.
+ */ +static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned long nlongs = BITS_TO_LONGS(maxnode); + int ret; + + if (in_compat_syscall()) + ret = compat_get_bitmap(mask, + (const compat_ulong_t __user *)nmask, + maxnode); + else + ret = copy_from_user(mask, nmask, + nlongs * sizeof(unsigned long)); + + if (ret) + return -EFAULT; + + if (maxnode % BITS_PER_LONG) + mask[nlongs - 1] &= (1UL << (maxnode % BITS_PER_LONG)) - 1; + + return 0; +} + +/* Copy a node mask from user space. */ +static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, + unsigned long maxnode) +{ + --maxnode; + nodes_clear(*nodes); + if (maxnode == 0 || !nmask) + return 0; + if (maxnode > PAGE_SIZE*BITS_PER_BYTE) + return -EINVAL; + + /* + * When the user specified more nodes than supported just check + * if the non supported part is all zero, one word at a time, + * starting at the end. + */ + while (maxnode > MAX_NUMNODES) { + unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); + unsigned long t; + + if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) + return -EFAULT; + + if (maxnode - bits >= MAX_NUMNODES) { + maxnode -= bits; + } else { + maxnode = MAX_NUMNODES; + t &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); + } + if (t) + return -EINVAL; + } + + return get_bitmap(nodes_addr(*nodes), nmask, maxnode); +} + +/* Copy a kernel node mask to user space */ +static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, + nodemask_t *nodes) +{ + unsigned long copy = ALIGN(maxnode-1, 64) / 8; + unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); + bool compat = in_compat_syscall(); + + if (compat) + nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t); + + if (copy > nbytes) { + if (copy > PAGE_SIZE) + return -EINVAL; + if (clear_user((char __user *)mask + nbytes, copy - nbytes)) + return -EFAULT; + copy = nbytes; + maxnode = nr_node_ids; + 
} + + if (compat) + return compat_put_bitmap((compat_ulong_t __user *)mask, + nodes_addr(*nodes), maxnode); + + return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; +} + +/* Basic parameter sanity check used by both mbind() and set_mempolicy() */ +static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) +{ + *flags = *mode & MPOL_MODE_FLAGS; + *mode &= ~MPOL_MODE_FLAGS; + + if ((unsigned int)(*mode) >= MPOL_MAX) + return -EINVAL; + if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES)) + return -EINVAL; + if (*flags & MPOL_F_NUMA_BALANCING) { + if (*mode != MPOL_BIND) + return -EINVAL; + *flags |= (MPOL_F_MOF | MPOL_F_MORON); + } + return 0; +} + +static long kernel_mbind(unsigned long start, unsigned long len, + unsigned long mode, const unsigned long __user *nmask, + unsigned long maxnode, unsigned int flags) +{ + unsigned short mode_flags; + nodemask_t nodes; + int lmode = mode; + int err; + + start = untagged_addr(start); + err = sanitize_mpol_flags(&lmode, &mode_flags); + if (err) + return err; + + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + + return do_mbind(start, len, lmode, mode_flags, &nodes, flags); +} + +SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, + unsigned long, home_node, unsigned long, flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + struct mempolicy *new; + unsigned long end; + int err = -ENOENT; + VMA_ITERATOR(vmi, mm, start); + + start = untagged_addr(start); + if (start & ~PAGE_MASK) + return -EINVAL; + /* + * flags is used for future extension if any. + */ + if (flags != 0) + return -EINVAL; + + /* + * Check home_node is online to avoid accessing uninitialized + * NODE_DATA. 
+ */ + if (home_node >= MAX_NUMNODES || !node_online(home_node)) + return -EINVAL; + + len = PAGE_ALIGN(len); + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + mmap_write_lock(mm); + prev = vma_prev(&vmi); + for_each_vma_range(vmi, vma, end) { + new = mpol_dup(vma_policy(vma)); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } + /* + * Only update home node if there is an existing vma policy + */ + if (!new) { + prev = vma; + continue; + } + + /* + * If any vma in the range got policy other than MPOL_BIND + * or MPOL_PREFERRED_MANY we return error. We don't reset + * the home node for vmas we already updated before. + */ + if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { + mpol_put(new); + err = -EOPNOTSUPP; + break; + } + + new->home_node = home_node; + err = mbind_range(&vmi, vma, &prev, start, end, new); + mpol_put(new); + if (err) + break; + } + mmap_write_unlock(mm); + return err; +} +/* mbind(2) system call entry point: passes its arguments unchanged to kernel_mbind(). */ +SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, + unsigned long, mode, const unsigned long __user *, nmask, + unsigned long, maxnode, unsigned int, flags) +{ + return kernel_mbind(start, len, mode, nmask, maxnode, flags); +} + +/* + * Set the process memory policy: validate @mode with sanitize_mpol_flags(), + * copy the node mask from user space with get_nodes(), then apply both with + * do_set_mempolicy(). Returns the first error reported by the two helpers, + * otherwise the result of do_set_mempolicy(). + */ +static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned short mode_flags; + nodemask_t nodes; + int lmode = mode; + int err; + + err = sanitize_mpol_flags(&lmode, &mode_flags); + if (err) + return err; + + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + + return do_set_mempolicy(lmode, mode_flags, &nodes); +} +/* set_mempolicy(2) system call entry point: thin wrapper around kernel_set_mempolicy(). */ +SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, + unsigned long, maxnode) +{ + return kernel_set_mempolicy(mode, nmask, maxnode); +} +/* + * Implementation of migrate_pages(2). + * + * Copies the @old_nodes and @new_nodes masks from user space into a scratch + * buffer, looks up the target task (current when @pid is 0), checks + * ptrace_may_access(), the cpuset limits and security_task_movememory(), + * and finally calls do_migrate_pages() on the mm of the task. + * + * Returns the result of do_migrate_pages(), or a negative errno when one of + * the earlier steps fails. + */ +static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, + const unsigned long __user *old_nodes, + const unsigned long __user *new_nodes) +{ + struct mm_struct *mm = NULL; +
struct task_struct *task; + nodemask_t task_nodes; + int err; + nodemask_t *old; + nodemask_t *new; + NODEMASK_SCRATCH(scratch); + + if (!scratch) + return -ENOMEM; + + old = &scratch->mask1; + new = &scratch->mask2; + + err = get_nodes(old, old_nodes, maxnode); + if (err) + goto out; + + err = get_nodes(new, new_nodes, maxnode); + if (err) + goto out; + + /* Find the mm_struct */ + rcu_read_lock(); + task = pid ? find_task_by_vpid(pid) : current; + if (!task) { + rcu_read_unlock(); + err = -ESRCH; + goto out; + } + get_task_struct(task); /* task reference is dropped at out_put or after get_task_mm() */ + + err = -EINVAL; /* default result for the checks below that do not set err */ + + /* + * Check if this process has the right to modify the specified process. + * Use the regular "ptrace_may_access()" checks. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { + rcu_read_unlock(); + err = -EPERM; + goto out_put; + } + rcu_read_unlock(); + + task_nodes = cpuset_mems_allowed(task); + /* Is the user allowed to access the target nodes? */ + if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out_put; + } + + task_nodes = cpuset_mems_allowed(current); /* now the nodes allowed for the caller itself */ + nodes_and(*new, *new, task_nodes); + if (nodes_empty(*new)) /* err is still -EINVAL here */ + goto out_put; + + err = security_task_movememory(task); + if (err) + goto out_put; + + mm = get_task_mm(task); + put_task_struct(task); /* only mm is used from here on */ + + if (!mm) { + err = -EINVAL; + goto out; + } + + err = do_migrate_pages(mm, old, new, + capable(CAP_SYS_NICE) ?
MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); + + mmput(mm); +out: + NODEMASK_SCRATCH_FREE(scratch); + + return err; + +out_put: + put_task_struct(task); + goto out; + +} +/* migrate_pages(2) system call entry point: thin wrapper around kernel_migrate_pages(). */ +SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, + const unsigned long __user *, old_nodes, + const unsigned long __user *, new_nodes) +{ + return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); +} + + +/* + * Retrieve NUMA policy: query do_get_mempolicy() and copy the resulting + * policy value to @policy and the node mask to @nmask, each only when the + * corresponding user pointer is non-NULL. Returns 0, -EINVAL, -EFAULT or + * the error of do_get_mempolicy() / copy_nodes_to_user(). + */ +static int kernel_get_mempolicy(int __user *policy, + unsigned long __user *nmask, + unsigned long maxnode, + unsigned long addr, + unsigned long flags) +{ + int err; + int pval; + nodemask_t nodes; + + if (nmask != NULL && maxnode < nr_node_ids) /* a supplied mask must cover at least nr_node_ids */ + return -EINVAL; + + addr = untagged_addr(addr); + + err = do_get_mempolicy(&pval, &nodes, addr, flags); + + if (err) + return err; + + if (policy && put_user(pval, policy)) + return -EFAULT; + + if (nmask) + err = copy_nodes_to_user(nmask, maxnode, &nodes); + + return err; +} +/* get_mempolicy(2) system call entry point: thin wrapper around kernel_get_mempolicy(). */ +SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + unsigned long __user *, nmask, unsigned long, maxnode, + unsigned long, addr, unsigned long, flags) +{ + return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); +} +/* + * Report whether the pages of @vma may be migrated. Returns false for + * VM_IO / VM_PFNMAP and DAX mappings, for hugetlb VMAs whose hstate does + * not support migration, and for file mappings whose mapping GFP zone is + * below policy_zone; true otherwise. + */ +bool vma_migratable(struct vm_area_struct *vma) +{ + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + return false; + + /* + * DAX device mappings require predictable access latency, so avoid + * incurring periodic faults. + */ + if (vma_is_dax(vma)) + return false; + + if (is_vm_hugetlb_page(vma) && + !hugepage_migration_supported(hstate_vma(vma))) + return false; + + /* + * Migration allocates pages in the highest zone. If we cannot + * do so then migration (at least from node to node) is not + * possible.
+ */ + if (vma->vm_file && + gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) + < policy_zone) + return false; + return true; +} +/* + * Look up the policy that applies to @addr in @vma, without any fallback + * to the task or default policy. The vm_ops->get_policy() hook is used + * when the VMA provides one, otherwise vma->vm_policy. Returns NULL when + * @vma is NULL or no policy is found. + */ +struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct mempolicy *pol = NULL; + + if (vma) { + if (vma->vm_ops && vma->vm_ops->get_policy) { + pol = vma->vm_ops->get_policy(vma, addr); + } else if (vma->vm_policy) { + pol = vma->vm_policy; + + /* + * shmem_alloc_page() passes MPOL_F_SHARED policy with + * a pseudo vma whose vma->vm_ops=NULL. Take a reference + * count on these policies which will be dropped by + * mpol_cond_put() later + */ + if (mpol_needs_cond_ref(pol)) + mpol_get(pol); + } + } + + return pol; +} + +/* + * get_vma_policy(@vma, @addr) + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup + * + * Returns effective policy for a VMA at specified address. + * Falls back to current->mempolicy or system default policy, as necessary. + * Shared policies [those marked as MPOL_F_SHARED] require an extra reference + * count--added by the get_policy() vm_op, as appropriate--to protect against + * freeing by another task. It is the caller's responsibility to free the + * extra reference for shared policies.
+ */ +static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr) +{ + struct mempolicy *pol = __get_vma_policy(vma, addr); + + if (!pol) /* no VMA-level policy: fall back to get_task_policy(current) */ + pol = get_task_policy(current); + + return pol; +} +/* + * Return true when the policy governing @vma has MPOL_F_MOF set. + * A policy obtained from vm_ops->get_policy() is released again with + * mpol_cond_put(); otherwise vma->vm_policy is used, falling back to + * get_task_policy(current) when the VMA has none. + */ +bool vma_policy_mof(struct vm_area_struct *vma) +{ + struct mempolicy *pol; + + if (vma->vm_ops && vma->vm_ops->get_policy) { + bool ret = false; + + pol = vma->vm_ops->get_policy(vma, vma->vm_start); + if (pol && (pol->flags & MPOL_F_MOF)) + ret = true; + mpol_cond_put(pol); + + return ret; + } + + pol = vma->vm_policy; + if (!pol) + pol = get_task_policy(current); + + return pol->flags & MPOL_F_MOF; +} +/* + * Tell whether @policy applies to an allocation from @zone: true when @zone + * is at or above policy_zone, or at or above ZONE_MOVABLE when policy->nodes + * does not intersect node_states[N_HIGH_MEMORY]. + */ +bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) +{ + enum zone_type dynamic_policy_zone = policy_zone; + + BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); + + /* + * If policy->nodes has movable memory only, + * we apply the policy only when gfp_zone(gfp) = ZONE_MOVABLE. + * + * policy->nodes is intersected with node_states[N_MEMORY], + * so if the following test fails, it implies + * policy->nodes has movable memory only. + */ + if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY])) + dynamic_policy_zone = ZONE_MOVABLE; + + return zone >= dynamic_policy_zone; +} + +/* + * Return a nodemask representing a mempolicy for filtering nodes for + * page allocation. + * + * Returns &policy->nodes for MPOL_PREFERRED_MANY, and for MPOL_BIND when + * both apply_policy_zone() and cpuset_nodemask_valid_mems_allowed() agree; + * NULL in every other case. + */ +nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) +{ + int mode = policy->mode; + + /* Lower zones don't get a nodemask applied for MPOL_BIND */ + if (unlikely(mode == MPOL_BIND) && + apply_policy_zone(policy, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&policy->nodes)) + return &policy->nodes; + + if (mode == MPOL_PREFERRED_MANY) + return &policy->nodes; + + return NULL; +} + +/* + * Return the preferred node id for 'prefer' mempolicy, and return + * the given id for all other policies. + * + * policy_node() is always coupled with policy_nodemask(), which + * secures the nodemask limit for 'bind' and 'prefer-many' policy.
+ */ +static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) +{ + if (policy->mode == MPOL_PREFERRED) { + nd = first_node(policy->nodes); + } else { + /* + * __GFP_THISNODE shouldn't even be used with the bind policy + * because we might easily break the expectation to stay on the + * requested node and not break the policy. + */ + WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); + } + + if ((policy->mode == MPOL_BIND || + policy->mode == MPOL_PREFERRED_MANY) && + policy->home_node != NUMA_NO_NODE) /* an explicit home node takes precedence over nd */ + return policy->home_node; + + return nd; +} + +/* + * Do dynamic interleaving for a process: pick the node that follows + * current->il_prev in policy->nodes and remember it in il_prev. + */ +static unsigned interleave_nodes(struct mempolicy *policy) +{ + unsigned next; + struct task_struct *me = current; + + next = next_node_in(me->il_prev, policy->nodes); + if (next < MAX_NUMNODES) /* record the choice only when it is a valid node id */ + me->il_prev = next; + return next; +} + +/* + * Depending on the memory policy provide a node from which to allocate the + * next slab entry. + */ +unsigned int mempolicy_slab_node(void) +{ + struct mempolicy *policy; + int node = numa_mem_id(); + + if (!in_task()) /* not in task context: the task policy is ignored */ + return node; + + policy = current->mempolicy; + if (!policy) + return node; + + switch (policy->mode) { + case MPOL_PREFERRED: + return first_node(policy->nodes); + + case MPOL_INTERLEAVE: + return interleave_nodes(policy); + + case MPOL_BIND: + case MPOL_PREFERRED_MANY: + { + struct zoneref *z; + + /* + * Follow bind policy behavior and start allocation at the + * first node. + */ + struct zonelist *zonelist; + enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); + zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; + z = first_zones_zonelist(zonelist, highest_zoneidx, + &policy->nodes); + return z->zone ? zone_to_nid(z->zone) : node; /* no matching zone: use the local memory node */ + } + case MPOL_LOCAL: + return node; + + default: + BUG(); + } +} + +/* + * Do static interleaving for a VMA with known offset @n.
Returns the n'th + * node in pol->nodes (starting from n=0), wrapping around if n exceeds the + * number of present nodes. + */ +static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) +{ + nodemask_t nodemask = pol->nodes; + unsigned int target, nnodes; + int i; + int nid; + /* + * The barrier will stabilize the nodemask in a register or on + * the stack so that it will stop changing under the code. + * + * Between first_node() and next_node(), pol->nodes could be changed + * by other threads. So we put pol->nodes in a local stack. + */ + barrier(); + + nnodes = nodes_weight(nodemask); + if (!nnodes) /* empty mask: use the local node, which also avoids the modulo by zero below */ + return numa_node_id(); + target = (unsigned int)n % nnodes; + nid = first_node(nodemask); + for (i = 0; i < target; i++) + nid = next_node(nid, nodemask); + return nid; +} + +/* + * Determine a node number for interleave. With a VMA the node is derived + * from the offset of @addr (in units of 1 << @shift bytes) through + * offset_il_node(); without one, interleave_nodes() is used instead. + */ +static inline unsigned interleave_nid(struct mempolicy *pol, + struct vm_area_struct *vma, unsigned long addr, int shift) +{ + if (vma) { + unsigned long off; + + /* + * for small pages, there is no difference between + * shift and PAGE_SHIFT, so the bit-shift is safe. + * for huge pages, since vm_pgoff is in units of small + * pages, we need to shift off the always 0 bits to get + * a useful offset. + */ + BUG_ON(shift < PAGE_SHIFT); + off = vma->vm_pgoff >> (shift - PAGE_SHIFT); + off += (addr - vma->vm_start) >> shift; + return offset_il_node(pol, off); + } else + return interleave_nodes(pol); +} + +#ifdef CONFIG_HUGETLBFS +/* + * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask) + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup and interleave policy + * @gfp_flags: for requested zone + * @mpol: pointer to mempolicy pointer for reference counted mempolicy + * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy + * + * Returns a nid suitable for a huge page allocation and a pointer + * to the struct mempolicy for conditional unref after allocation.
+ * If the effective policy is 'bind' or 'prefer-many', returns a pointer + * to the mempolicy's @nodemask for filtering the zonelist. + * + * Must be protected by read_mems_allowed_begin() + */ +int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, + struct mempolicy **mpol, nodemask_t **nodemask) +{ + int nid; + int mode; + + *mpol = get_vma_policy(vma, addr); + *nodemask = NULL; /* only set for bind / prefer-many below */ + mode = (*mpol)->mode; + + if (unlikely(mode == MPOL_INTERLEAVE)) { + nid = interleave_nid(*mpol, vma, addr, + huge_page_shift(hstate_vma(vma))); + } else { + nid = policy_node(gfp_flags, *mpol, numa_node_id()); + if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) + *nodemask = &(*mpol)->nodes; + } + return nid; +} + +/* + * init_nodemask_of_mempolicy + * + * If the current task's mempolicy is "default" [NULL], return 'false' + * to indicate default policy. Otherwise, extract the policy nodemask + * for 'bind' or 'interleave' policy into the argument nodemask, or + * initialize the argument nodemask to contain the single node for + * 'preferred' or 'local' policy and return 'true' to indicate presence + * of non-default mempolicy. + * + * We don't bother with reference counting the mempolicy [mpol_get/put] + * because the current task is examining it's own mempolicy and a task's + * mempolicy is only ever changed by the task itself. + * + * N.B., it is the caller's responsibility to free a returned nodemask.
+ */ +bool init_nodemask_of_mempolicy(nodemask_t *mask) +{ + struct mempolicy *mempolicy; + + if (!(mask && current->mempolicy)) /* no output mask, or the task has no policy of its own */ + return false; + + task_lock(current); + mempolicy = current->mempolicy; + switch (mempolicy->mode) { + case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: + case MPOL_BIND: + case MPOL_INTERLEAVE: + *mask = mempolicy->nodes; /* these modes copy the policy nodemask unchanged */ + break; + + case MPOL_LOCAL: + init_nodemask_of_node(mask, numa_node_id()); + break; + + default: + BUG(); + } + task_unlock(current); + + return true; +} +#endif /* CONFIG_HUGETLBFS */ + +/* + * mempolicy_in_oom_domain + * + * If tsk's mempolicy is "bind", check for intersection between mask and + * the policy nodemask. Otherwise, return true for all other policies + * including "interleave", as a tsk with "interleave" policy may have + * memory allocated from all nodes in system. + * + * Takes task_lock(tsk) to prevent freeing of its mempolicy. + */ +bool mempolicy_in_oom_domain(struct task_struct *tsk, + const nodemask_t *mask) +{ + struct mempolicy *mempolicy; + bool ret = true; + + if (!mask) /* no mask given: report true without looking at the policy */ + return ret; + + task_lock(tsk); + mempolicy = tsk->mempolicy; + if (mempolicy && mempolicy->mode == MPOL_BIND) + ret = nodes_intersects(mempolicy->nodes, *mask); + task_unlock(tsk); + + return ret; +} + +/* Allocate a page in interleaved policy. + Own path because it needs to do special accounting.
*/ +static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, + unsigned nid) +{ + struct page *page; + + page = __alloc_pages(gfp, order, nid, NULL); + /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ + if (!static_branch_likely(&vm_numa_stat_key)) + return page; + if (page && page_to_nid(page) == nid) { /* count a hit only when the page really came from @nid */ + preempt_disable(); + __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); + preempt_enable(); + } + return page; +} +/* + * Allocation helper for MPOL_PREFERRED_MANY: try the nodes of @pol first, + * starting from @nid, then retry without any nodemask. Returns the page, + * or NULL when both attempts fail. + */ +static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, + int nid, struct mempolicy *pol) +{ + struct page *page; + gfp_t preferred_gfp; + + /* + * This is a two pass approach. The first pass will only try the + * preferred nodes but skip the direct reclaim and allow the + * allocation to fail, while the second pass will try all the + * nodes in system. + */ + preferred_gfp = gfp | __GFP_NOWARN; + preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); + if (!page) /* second pass: original gfp flags, no nodemask */ + page = __alloc_pages(gfp, order, nid, NULL); + + return page; +} + +/** + * vma_alloc_folio - Allocate a folio for a VMA. + * @gfp: GFP flags. + * @order: Order of the folio. + * @vma: Pointer to VMA or NULL if not available. + * @addr: Virtual address of the allocation. Must be inside @vma. + * @hugepage: For hugepages try only the preferred node if possible. + * + * Allocate a folio for a specific address in @vma, using the appropriate + * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock + * of the mm_struct of the VMA to prevent it from going away. Should be + * used for all allocations for folios that will be mapped into user space. + * + * Return: The folio on success or NULL if allocation fails.
+ */ +struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr, bool hugepage) +{ + struct mempolicy *pol; + int node = numa_node_id(); + struct folio *folio; + int preferred_nid; + nodemask_t *nmask; + + pol = get_vma_policy(vma, addr); + + if (pol->mode == MPOL_INTERLEAVE) { /* interleave: the node is derived from the offset of addr */ + struct page *page; + unsigned nid; + + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); + mpol_cond_put(pol); /* pol is not used again in this branch */ + gfp |= __GFP_COMP; + page = alloc_page_interleave(gfp, order, nid); + if (page && order > 1) + prep_transhuge_page(page); + folio = (struct folio *)page; + goto out; + } + + if (pol->mode == MPOL_PREFERRED_MANY) { /* prefer-many: two pass allocation helper */ + struct page *page; + + node = policy_node(gfp, pol, node); + gfp |= __GFP_COMP; + page = alloc_pages_preferred_many(gfp, order, node, pol); + mpol_cond_put(pol); + if (page && order > 1) + prep_transhuge_page(page); + folio = (struct folio *)page; + goto out; + } + + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { + int hpage_node = node; + + /* + * For hugepage allocation and non-interleave policy which + * allows the current node (or other explicitly preferred + * node) we only try to allocate from the current/preferred + * node and don't fall back to other nodes, as the cost of + * remote accesses would likely offset THP benefits. + * + * If the policy is interleave or does not allow the current + * node in its nodemask, we allocate the standard way. + */ + if (pol->mode == MPOL_PREFERRED) + hpage_node = first_node(pol->nodes); + + nmask = policy_nodemask(gfp, pol); + if (!nmask || node_isset(hpage_node, *nmask)) { + mpol_cond_put(pol); /* NOTE(review): nmask may still point into pol->nodes and is passed to the fallback __folio_alloc() below -- confirm the policy cannot go away after this put */ + /* + * First, try to allocate THP only on local node, but + * don't reclaim unnecessarily, just compact.
+ */ + folio = __folio_alloc_node(gfp | __GFP_THISNODE | + __GFP_NORETRY, order, hpage_node); + + /* + * If hugepage allocations are configured to always + * synchronous compact or the vma has been madvised + * to prefer hugepage backing, retry allowing remote + * memory with both reclaim and compact as well. + */ + if (!folio && (gfp & __GFP_DIRECT_RECLAIM)) + folio = __folio_alloc(gfp, order, hpage_node, + nmask); + + goto out; + } + } + + nmask = policy_nodemask(gfp, pol); /* common path: preferred node of the policy plus its nodemask, if any */ + preferred_nid = policy_node(gfp, pol, node); + folio = __folio_alloc(gfp, order, preferred_nid, nmask); + mpol_cond_put(pol); +out: + return folio; +} +EXPORT_SYMBOL(vma_alloc_folio); + +/** + * alloc_pages - Allocate pages. + * @gfp: GFP flags. + * @order: Power of two of number of pages to allocate. + * + * Allocate 1 << @order contiguous pages. The physical address of the + * first page is naturally aligned (eg an order-3 allocation will be aligned + * to a multiple of 8 * PAGE_SIZE bytes). The NUMA policy of the current + * process is honoured when in process context. + * + * Context: Can be called from any context, providing the appropriate GFP + * flags are used. + * Return: The page on success or NULL if allocation fails.
+ */ +struct page *alloc_pages(gfp_t gfp, unsigned order) +{ + struct mempolicy *pol = &default_policy; + struct page *page; + + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) /* otherwise default_policy stays in effect */ + pol = get_task_policy(current); + + /* + * No reference counting needed for current->mempolicy + * nor system default_policy + */ + if (pol->mode == MPOL_INTERLEAVE) + page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else if (pol->mode == MPOL_PREFERRED_MANY) + page = alloc_pages_preferred_many(gfp, order, + policy_node(gfp, pol, numa_node_id()), pol); + else + page = __alloc_pages(gfp, order, + policy_node(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol)); + + return page; +} +EXPORT_SYMBOL(alloc_pages); +/* + * Folio flavour of alloc_pages(): adds __GFP_COMP to @gfp and, for + * @order > 1, calls prep_transhuge_page() on the result. Returns NULL when + * alloc_pages() fails. + */ +struct folio *folio_alloc(gfp_t gfp, unsigned order) +{ + struct page *page = alloc_pages(gfp | __GFP_COMP, order); + + if (page && order > 1) + prep_transhuge_page(page); + return (struct folio *)page; +} +EXPORT_SYMBOL(folio_alloc); +/* + * Bulk allocation for MPOL_INTERLEAVE. @nr_pages is split evenly across the + * nodes_weight(pol->nodes) nodes; the remainder is handed out one extra page + * at a time to the first nodes visited. Each node is chosen with + * interleave_nodes(). Pages are stored in @page_array. + * + * Returns the number of pages actually allocated. + * + * NOTE(review): nr_pages is divided by nodes_weight(pol->nodes) without a + * zero check; presumably an interleave policy never has an empty mask + * here -- confirm. + */ +static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + int nodes; + unsigned long nr_pages_per_node; + int delta; + int i; + unsigned long nr_allocated; + unsigned long total_allocated = 0; + + nodes = nodes_weight(pol->nodes); + nr_pages_per_node = nr_pages / nodes; + delta = nr_pages - nodes * nr_pages_per_node; /* remainder of the division above */ + + for (i = 0; i < nodes; i++) { + if (delta) { + nr_allocated = __alloc_pages_bulk(gfp, + interleave_nodes(pol), NULL, + nr_pages_per_node + 1, NULL, + page_array); + delta--; + } else { + nr_allocated = __alloc_pages_bulk(gfp, + interleave_nodes(pol), NULL, + nr_pages_per_node, NULL, page_array); + } + + page_array += nr_allocated; /* continue filling right after what this node delivered */ + total_allocated += nr_allocated; + } + + return total_allocated; +} +/* + * Bulk allocation for MPOL_PREFERRED_MANY. The first request is limited to + * pol->nodes, starts at @nid and has __GFP_DIRECT_RECLAIM and __GFP_NOFAIL + * cleared; any shortfall is then requested from numa_node_id() with the + * original @gfp and no nodemask. Returns the total number of pages + * allocated. + */ +static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, + struct mempolicy *pol, unsigned long nr_pages, + struct page **page_array) +{ + gfp_t preferred_gfp; + unsigned long nr_allocated = 0; + +
preferred_gfp = gfp | __GFP_NOWARN; + preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + + nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, + nr_pages, NULL, page_array); + + if (nr_allocated < nr_pages) + nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, + nr_pages - nr_allocated, NULL, + page_array + nr_allocated); + return nr_allocated; +} + +/* + * Bulk page allocation that takes the memory policy into account, for + * callers such as vmalloc that need both at the same time. + * + * It can speed up memory allocation, especially for interleaved + * allocations. + * + * Returns the number of pages stored in @page_array. + */ +unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, + unsigned long nr_pages, struct page **page_array) +{ + struct mempolicy *pol = &default_policy; + + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); + + if (pol->mode == MPOL_INTERLEAVE) + return alloc_pages_bulk_array_interleave(gfp, pol, + nr_pages, page_array); + + if (pol->mode == MPOL_PREFERRED_MANY) + return alloc_pages_bulk_array_preferred_many(gfp, + numa_node_id(), pol, nr_pages, page_array); + + return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol), nr_pages, NULL, + page_array); +} +/* + * Give @dst a duplicate of the policy of @src. Returns 0 on success or the + * error encoded in the mpol_dup() result. + */ +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + struct mempolicy *pol = mpol_dup(vma_policy(src)); + + if (IS_ERR(pol)) + return PTR_ERR(pol); + dst->vm_policy = pol; + return 0; +} + +/* + * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it + * rebinds the mempolicy its copying by calling mpol_rebind_policy() + * with the mems_allowed returned by cpuset_mems_allowed(). This + * keeps mempolicies cpuset relative after its cpuset moves. See + * further kernel/cpuset.c update_nodemask(). + * + * current's mempolicy may be rebinded by the other task(the task that changes + * cpuset's mems), so we needn't do rebind work for current task.
+ */ + +/* + * Slow path of a mempolicy duplicate: allocate a copy of @old from + * policy_cache and give it a reference count of 1. Returns the copy, or + * ERR_PTR(-ENOMEM) when the allocation fails. + */ +struct mempolicy *__mpol_dup(struct mempolicy *old) +{ + struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + + if (!new) + return ERR_PTR(-ENOMEM); + + /* task's mempolicy is protected by alloc_lock */ + if (old == current->mempolicy) { + task_lock(current); + *new = *old; + task_unlock(current); + } else + *new = *old; + + if (current_cpuset_is_being_rebound()) { + nodemask_t mems = cpuset_mems_allowed(current); + mpol_rebind_policy(new, &mems); + } + atomic_set(&new->refcnt, 1); /* the struct copy above also copied the refcnt of old; reset it */ + return new; +} + +/* + * Slow path of a mempolicy comparison: true when mode, flags, home_node, + * the user nodemask (if stored) and the effective nodes all match. + */ +bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) +{ + if (!a || !b) /* a NULL policy never compares equal here, not even to another NULL */ + return false; + if (a->mode != b->mode) + return false; + if (a->flags != b->flags) + return false; + if (a->home_node != b->home_node) + return false; + if (mpol_store_user_nodemask(a)) + if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) + return false; + + switch (a->mode) { + case MPOL_BIND: + case MPOL_INTERLEAVE: + case MPOL_PREFERRED: + case MPOL_PREFERRED_MANY: + return !!nodes_equal(a->nodes, b->nodes); + case MPOL_LOCAL: + return true; + default: + BUG(); + return false; + } +} + +/* + * Shared memory backing store policy support. + * + * Remember policies even when nobody has shared memory mapped. + * The policies are kept in Red-Black tree linked from the inode. + * They are protected by the sp->lock rwlock, which should be held + * for any accesses to the tree. + */ + +/* + * lookup first element intersecting start-end.
Caller holds sp->lock for + * reading or for writing + */ +static struct sp_node * +sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) +{ + struct rb_node *n = sp->root.rb_node; + + while (n) { /* descend until some node overlapping start-end is found */ + struct sp_node *p = rb_entry(n, struct sp_node, nd); + + if (start >= p->end) + n = n->rb_right; + else if (end <= p->start) + n = n->rb_left; + else + break; + } + if (!n) + return NULL; + for (;;) { /* step back through predecessors while they still end after start */ + struct sp_node *w = NULL; + struct rb_node *prev = rb_prev(n); + if (!prev) + break; + w = rb_entry(prev, struct sp_node, nd); + if (w->end <= start) + break; + n = prev; + } + return rb_entry(n, struct sp_node, nd); +} + +/* + * Insert a new shared policy into the tree. Caller holds sp->lock for + * writing. + */ +static void sp_insert(struct shared_policy *sp, struct sp_node *new) +{ + struct rb_node **p = &sp->root.rb_node; + struct rb_node *parent = NULL; + struct sp_node *nd; + + while (*p) { + parent = *p; + nd = rb_entry(parent, struct sp_node, nd); + if (new->start < nd->start) + p = &(*p)->rb_left; + else if (new->end > nd->end) + p = &(*p)->rb_right; + else /* new lies within the range of an existing node */ + BUG(); + } + rb_link_node(&new->nd, parent, p); + rb_insert_color(&new->nd, &sp->root); + pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, + new->policy ?
new->policy->mode : 0); +} + +/* + * Find shared policy intersecting idx. Takes sp->lock for reading; a policy + * that is found is returned with an extra reference (mpol_get), otherwise + * NULL is returned. + */ +struct mempolicy * +mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +{ + struct mempolicy *pol = NULL; + struct sp_node *sn; + + if (!sp->root.rb_node) /* empty tree: checked without taking the lock */ + return NULL; + read_lock(&sp->lock); + sn = sp_lookup(sp, idx, idx+1); + if (sn) { + mpol_get(sn->policy); + pol = sn->policy; + } + read_unlock(&sp->lock); + return pol; +} +/* Drop the policy reference held by @n and free the node itself. */ +static void sp_free(struct sp_node *n) +{ + mpol_put(n->policy); + kmem_cache_free(sn_cache, n); +} + +/** + * mpol_misplaced - check whether current page node is valid in policy + * + * @page: page to be checked + * @vma: vm area where page mapped + * @addr: virtual address where page mapped + * + * Lookup current policy node id for vma,addr and "compare to" page's + * node id. Policy determination "mimics" alloc_page_vma(). + * Called from fault path where we know the vma and faulting address. + * + * Return: NUMA_NO_NODE if the page is in a node that is valid for this + * policy, or a suitable node ID to allocate a replacement page from.
+ */ +int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol; + struct zoneref *z; + int curnid = page_to_nid(page); + unsigned long pgoff; + int thiscpu = raw_smp_processor_id(); + int thisnid = cpu_to_node(thiscpu); + int polnid = NUMA_NO_NODE; + int ret = NUMA_NO_NODE; + + pol = get_vma_policy(vma, addr); + if (!(pol->flags & MPOL_F_MOF)) /* without MPOL_F_MOF the page is never reported as misplaced */ + goto out; + + switch (pol->mode) { + case MPOL_INTERLEAVE: + pgoff = vma->vm_pgoff; + pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; + polnid = offset_il_node(pol, pgoff); + break; + + case MPOL_PREFERRED: + if (node_isset(curnid, pol->nodes)) + goto out; + polnid = first_node(pol->nodes); + break; + + case MPOL_LOCAL: + polnid = numa_node_id(); + break; + + case MPOL_BIND: + /* Optimize placement among multiple nodes via NUMA balancing */ + if (pol->flags & MPOL_F_MORON) { + if (node_isset(thisnid, pol->nodes)) + break; + goto out; + } + fallthrough; + + case MPOL_PREFERRED_MANY: + /* + * use current page if in policy nodemask, + * else select nearest allowed node, if any. + * If no allowed nodes, use current [!misplaced]. + */ + if (node_isset(curnid, pol->nodes)) + goto out; + z = first_zones_zonelist( + node_zonelist(numa_node_id(), GFP_HIGHUSER), + gfp_zone(GFP_HIGHUSER), + &pol->nodes); + polnid = zone_to_nid(z->zone); /* NOTE(review): z->zone is used unchecked here, unlike in mempolicy_slab_node(); presumably a zone is always found -- confirm */ + break; + + default: + BUG(); + } + + /* Migrate the page towards the node whose CPU is referencing it */ + if (pol->flags & MPOL_F_MORON) { + polnid = thisnid; + + if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) + goto out; + } + + if (curnid != polnid) /* report a target node only when it differs from the current one */ + ret = polnid; +out: + mpol_cond_put(pol); + + return ret; +} + +/* + * Drop the (possibly final) reference to task->mempolicy. It needs to be + * dropped after task->mempolicy is set to NULL so that any allocation done as + * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed + * policy.
+ */ +void mpol_put_task_policy(struct task_struct *task) +{ + struct mempolicy *pol; + + task_lock(task); + pol = task->mempolicy; + task->mempolicy = NULL; + task_unlock(task); + mpol_put(pol); +} +/* + * Unlink @n from the tree of @sp and free it with sp_free(). + * NOTE(review): the -l%lx in the debug format below looks like a typo + * for -%lx -- confirm before changing the message. + */ +static void sp_delete(struct shared_policy *sp, struct sp_node *n) +{ + pr_debug("deleting %lx-l%lx\n", n->start, n->end); + rb_erase(&n->nd, &sp->root); + sp_free(n); +} +/* Fill in the start, end and policy fields of @node. */ +static void sp_node_init(struct sp_node *node, unsigned long start, + unsigned long end, struct mempolicy *pol) +{ + node->start = start; + node->end = end; + node->policy = pol; +} +/* + * Allocate an sp_node for start-end that holds a duplicate of @pol marked + * MPOL_F_SHARED. Returns NULL when the node allocation or mpol_dup() fails. + */ +static struct sp_node *sp_alloc(unsigned long start, unsigned long end, + struct mempolicy *pol) +{ + struct sp_node *n; + struct mempolicy *newpol; + + n = kmem_cache_alloc(sn_cache, GFP_KERNEL); + if (!n) + return NULL; + + newpol = mpol_dup(pol); + if (IS_ERR(newpol)) { + kmem_cache_free(sn_cache, n); + return NULL; + } + newpol->flags |= MPOL_F_SHARED; + sp_node_init(n, start, end, newpol); + + return n; +} + +/* + * Replace a policy range: remove or trim every old node overlapping + * start-end, then insert @new when it is non-NULL. When an old node spans + * the whole new range it has to be split, which needs a spare node and + * policy; those are allocated with the lock dropped and the walk is then + * restarted. Returns 0 on success or -ENOMEM when that allocation fails. + */ +static int shared_policy_replace(struct shared_policy *sp, unsigned long start, + unsigned long end, struct sp_node *new) +{ + struct sp_node *n; + struct sp_node *n_new = NULL; + struct mempolicy *mpol_new = NULL; + int ret = 0; + +restart: + write_lock(&sp->lock); + n = sp_lookup(sp, start, end); + /* Take care of old policies in the same range. */ + while (n && n->start < end) { + struct rb_node *next = rb_next(&n->nd); /* fetched first because n may be deleted below */ + if (n->start >= start) { + if (n->end <= end) + sp_delete(sp, n); + else + n->start = end; + } else { + /* Old policy spanning whole new range.
*/ + if (n->end > end) { + if (!n_new) + goto alloc_new; + + *mpol_new = *n->policy; + atomic_set(&mpol_new->refcnt, 1); + sp_node_init(n_new, end, n->end, mpol_new); + n->end = start; + sp_insert(sp, n_new); + n_new = NULL; /* now owned by the tree: keep err_out from freeing them */ + mpol_new = NULL; + break; + } else + n->end = start; + } + if (!next) + break; + n = rb_entry(next, struct sp_node, nd); + } + if (new) + sp_insert(sp, new); + write_unlock(&sp->lock); + ret = 0; + +err_out: + if (mpol_new) + mpol_put(mpol_new); + if (n_new) + kmem_cache_free(sn_cache, n_new); + + return ret; + +alloc_new: + write_unlock(&sp->lock); /* the GFP_KERNEL allocations below are done without the lock held */ + ret = -ENOMEM; + n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); + if (!n_new) + goto err_out; + mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); + if (!mpol_new) + goto err_out; + atomic_set(&mpol_new->refcnt, 1); + goto restart; +} + +/** + * mpol_shared_policy_init - initialize shared policy for inode + * @sp: pointer to inode shared policy + * @mpol: struct mempolicy to install + * + * Install non-NULL @mpol in inode's shared policy rb-tree. + * On entry, the current task has a reference on a non-NULL @mpol. + * This must be released on exit. + * This is called at get_inode() calls and we can use GFP_KERNEL.
+ */ +void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) +{ + int ret; + + sp->root = RB_ROOT; /* empty tree == default mempolicy */ + rwlock_init(&sp->lock); + + if (mpol) { + struct vm_area_struct pvma; + struct mempolicy *new; + NODEMASK_SCRATCH(scratch); + + if (!scratch) /* no scratch space: leave the tree empty */ + goto put_mpol; + /* contextualize the tmpfs mount point mempolicy */ + new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); + if (IS_ERR(new)) + goto free_scratch; /* no valid nodemask intersection */ + + task_lock(current); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); + task_unlock(current); + if (ret) + goto put_new; + + /* Create pseudo-vma that contains just the policy */ + vma_init(&pvma, NULL); + pvma.vm_end = TASK_SIZE; /* policy covers entire file */ + mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ + +put_new: + mpol_put(new); /* drop initial ref */ +free_scratch: + NODEMASK_SCRATCH_FREE(scratch); +put_mpol: + mpol_put(mpol); /* drop our incoming ref on sb mpol */ + } +} +/* + * Install @npol in the shared policy tree @info for the page offset range + * covered by @vma (vm_pgoff up to vm_pgoff + vma_pages). With a NULL @npol + * no new node is inserted and the range is only cleared of old entries. + * Returns 0 on success, -ENOMEM when sp_alloc() fails, or the error of + * shared_policy_replace(). + */ +int mpol_set_shared_policy(struct shared_policy *info, + struct vm_area_struct *vma, struct mempolicy *npol) +{ + int err; + struct sp_node *new = NULL; + unsigned long sz = vma_pages(vma); + + pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", + vma->vm_pgoff, + sz, npol ? npol->mode : -1, + npol ? npol->flags : -1, + npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE); + + if (npol) { + new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); + if (!new) + return -ENOMEM; + } + err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); + if (err && new) /* the node was not inserted: release it again */ + sp_free(new); + return err; +} + +/* Free a backing policy store on inode delete.
*/
+void mpol_free_shared_policy(struct shared_policy *p)
+{
+	struct sp_node *n;
+	struct rb_node *next;
+
+	/* Nothing was ever inserted: skip taking the lock. */
+	if (!p->root.rb_node)
+		return;
+	write_lock(&p->lock);
+	next = rb_first(&p->root);
+	while (next) {
+		n = rb_entry(next, struct sp_node, nd);
+		/* Fetch the successor before @n is deleted. */
+		next = rb_next(&n->nd);
+		sp_delete(p, n);
+	}
+	write_unlock(&p->lock);
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+/* Set by the "numa_balancing=" boot option: 1 enables, -1 disables, 0 unset. */
+static int __initdata numabalancing_override;
+
+/*
+ * Apply the boot-time NUMA balancing state: an explicit override wins,
+ * otherwise the Kconfig default is applied when more than one node is online.
+ */
+static void __init check_numabalancing_enable(void)
+{
+	bool numabalancing_default = false;
+
+	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
+		numabalancing_default = true;
+
+	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
+	if (numabalancing_override)
+		set_numabalancing_state(numabalancing_override == 1);
+
+	if (num_online_nodes() > 1 && !numabalancing_override) {
+		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
+			numabalancing_default ? "Enabling" : "Disabling");
+		set_numabalancing_state(numabalancing_default);
+	}
+}
+
+/*
+ * Parse "numa_balancing=enable|disable".  Returns 1 when the value was
+ * recognised, 0 (after a warning) otherwise.
+ */
+static int __init setup_numabalancing(char *str)
+{
+	int ret = 0;
+	if (!str)
+		goto out;
+
+	if (!strcmp(str, "enable")) {
+		numabalancing_override = 1;
+		ret = 1;
+	} else if (!strcmp(str, "disable")) {
+		numabalancing_override = -1;
+		ret = 1;
+	}
+out:
+	if (!ret)
+		pr_warn("Unable to parse numa_balancing=\n");
+
+	return ret;
+}
+__setup("numa_balancing=", setup_numabalancing);
+#else
+static inline void __init check_numabalancing_enable(void)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+/*
+ * Boot-time setup: create the mempolicy and shared-policy-node slab caches,
+ * fill in the per-node preferred policies, install an interleave policy for
+ * system init and apply the NUMA balancing default.
+ *
+ * assumes fs == KERNEL_DS
+ */
+void __init numa_policy_init(void)
+{
+	nodemask_t interleave_nodes;
+	unsigned long largest = 0;
+	int nid, prefer = 0;
+
+	policy_cache = kmem_cache_create("numa_policy",
+					 sizeof(struct mempolicy),
+					 0, SLAB_PANIC, NULL);
+
+	sn_cache = kmem_cache_create("shared_policy_node",
+				     sizeof(struct sp_node),
+				     0, SLAB_PANIC, NULL);
+
+	for_each_node(nid) {
+		preferred_node_policy[nid] = (struct mempolicy) {
+			.refcnt = ATOMIC_INIT(1),
+			.mode = MPOL_PREFERRED,
+			.flags = MPOL_F_MOF | MPOL_F_MORON,
+			.nodes = nodemask_of_node(nid),
+		};
+	}
+
+	/*
+	 * Set interleaving policy for system init. Interleaving is only
+	 * enabled across suitably sized nodes (default is >= 16MB), or
+	 * fall back to the largest node if they're all smaller.
+	 */
+	nodes_clear(interleave_nodes);
+	for_each_node_state(nid, N_MEMORY) {
+		unsigned long total_pages = node_present_pages(nid);
+
+		/* Preserve the largest node */
+		if (largest < total_pages) {
+			largest = total_pages;
+			prefer = nid;
+		}
+
+		/*
+		 * Interleave this node?  The threshold is compared in pages
+		 * rather than bytes: total_pages << PAGE_SHIFT can wrap an
+		 * unsigned long on 32-bit configurations once a node holds
+		 * 4GB or more, which would wrongly exclude a large node.
+		 */
+		if (total_pages >= ((16UL << 20) >> PAGE_SHIFT))
+			node_set(nid, interleave_nodes);
+	}
+
+	/* All too small, use the largest */
+	if (unlikely(nodes_empty(interleave_nodes)))
+		node_set(prefer, interleave_nodes);
+
+	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
+		pr_err("%s: interleaving failed\n", __func__);
+
+	check_numabalancing_enable();
+}
+
+/* Reset policy of current process to default */
+void numa_default_policy(void)
+{
+	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
+}
+
+/*
+ * Parse and format mempolicy from/to strings
+ */
+
+/* Mode names, indexed by MPOL_* mode value. */
+static const char * const policy_modes[] =
+{
+	[MPOL_DEFAULT]    = "default",
+	[MPOL_PREFERRED]  = "prefer",
+	[MPOL_BIND]       = "bind",
+	[MPOL_INTERLEAVE] = "interleave",
+	[MPOL_LOCAL]      = "local",
+	[MPOL_PREFERRED_MANY]  = "prefer (many)",
+};
+
+
+#ifdef CONFIG_TMPFS
+/**
+ * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
+ * @str:  string containing mempolicy to parse
+ * @mpol:  pointer to struct mempolicy pointer, returned on success.
+ *
+ * Format of input:
+ *	<mode>[=<flags>][:<nodelist>]
+ *
+ * @str is modified in place while parsing (the '=' and ':' separators are
+ * temporarily replaced by NUL) and restored before returning.
+ *
+ * Return: %0 on success, else %1
+ */
+int mpol_parse_str(char *str, struct mempolicy **mpol)
+{
+	struct mempolicy *new = NULL;
+	unsigned short mode_flags;
+	nodemask_t nodes;
+	char *nodelist = strchr(str, ':');
+	char *flags = strchr(str, '=');
+	int err = 1, mode;
+
+	if (flags)
+		*flags++ = '\0';	/* terminate mode string */
+
+	if (nodelist) {
+		/* NUL-terminate mode or flags string */
+		*nodelist++ = '\0';
+		if (nodelist_parse(nodelist, nodes))
+			goto out;
+		/* Only nodes that currently have memory are accepted. */
+		if (!nodes_subset(nodes, node_states[N_MEMORY]))
+			goto out;
+	} else
+		nodes_clear(nodes);
+
+	/* @str now holds just the mode name; look it up in policy_modes[]. */
+	mode = match_string(policy_modes, MPOL_MAX, str);
+	if (mode < 0)
+		goto out;
+
+	switch (mode) {
+	case MPOL_PREFERRED:
+		/*
+		 * Insist on a nodelist of one node only, although later
+		 * we use first_node(nodes) to grab a single node, so here
+		 * nodelist (or nodes) cannot be empty.
+		 */
+		if (nodelist) {
+			char *rest = nodelist;
+			while (isdigit(*rest))
+				rest++;
+			/* Anything but digits means a range or a list. */
+			if (*rest)
+				goto out;
+			if (nodes_empty(nodes))
+				goto out;
+		}
+		break;
+	case MPOL_INTERLEAVE:
+		/*
+		 * Default to online nodes with memory if no nodelist
+		 */
+		if (!nodelist)
+			nodes = node_states[N_MEMORY];
+		break;
+	case MPOL_LOCAL:
+		/*
+		 * Don't allow a nodelist;  mpol_new() checks flags
+		 */
+		if (nodelist)
+			goto out;
+		break;
+	case MPOL_DEFAULT:
+		/*
+		 * Insist on a empty nodelist
+		 */
+		if (!nodelist)
+			err = 0;
+		/* No policy object is built: on success *mpol is set to NULL. */
+		goto out;
+	case MPOL_PREFERRED_MANY:
+	case MPOL_BIND:
+		/*
+		 * Insist on a nodelist
+		 */
+		if (!nodelist)
+			goto out;
+	}
+
+	mode_flags = 0;
+	if (flags) {
+		/*
+		 * Currently, we only support two mutually exclusive
+		 * mode flags.
+		 */
+		if (!strcmp(flags, "static"))
+			mode_flags |= MPOL_F_STATIC_NODES;
+		else if (!strcmp(flags, "relative"))
+			mode_flags |= MPOL_F_RELATIVE_NODES;
+		else
+			goto out;
+	}
+
+	new = mpol_new(mode, mode_flags, &nodes);
+	if (IS_ERR(new))
+		goto out;
+
+	/*
+	 * Save nodes for mpol_to_str() to show the tmpfs mount options
+	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
+	 */
+	if (mode != MPOL_PREFERRED) {
+		new->nodes = nodes;
+	} else if (nodelist) {
+		nodes_clear(new->nodes);
+		node_set(first_node(nodes), new->nodes);
+	} else {
+		/* "prefer" without a nodelist is recorded as MPOL_LOCAL. */
+		new->mode = MPOL_LOCAL;
+	}
+
+	/*
+	 * Save nodes for contextualization: this will be used to "clone"
+	 * the mempolicy in a specific context [cpuset] at a later time.
+	 */
+	new->w.user_nodemask = nodes;
+
+	err = 0;
+
+out:
+	/* Restore string for error message */
+	if (nodelist)
+		*--nodelist = ':';
+	if (flags)
+		*--flags = '=';
+	if (!err)
+		*mpol = new;
+	return err;
+}
+#endif /* CONFIG_TMPFS */
+
+/**
+ * mpol_to_str - format a mempolicy structure for printing
+ * @buffer:  to contain formatted mempolicy string
+ * @maxlen:  length of @buffer
+ * @pol:  pointer to mempolicy to be formatted
+ *
+ * Convert @pol into a string.  If @buffer is too short, truncate the string.
+ * Recommend a @maxlen of at least 32 for the longest mode, "prefer (many)", the
+ * longest flag, "relative", and to display at least a few node ids.
+ */
+void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+{
+	char *p = buffer;
+	nodemask_t nodes = NODE_MASK_NONE;
+	unsigned short mode = MPOL_DEFAULT;
+	unsigned short flags = 0;
+
+	/*
+	 * A NULL policy, the default policy and policies flagged MPOL_F_MORON
+	 * are all printed as "default" with no flags and no nodes.
+	 */
+	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
+		mode = pol->mode;
+		flags = pol->flags;
+	}
+
+	switch (mode) {
+	case MPOL_DEFAULT:
+	case MPOL_LOCAL:
+		break;
+	case MPOL_PREFERRED:
+	case MPOL_PREFERRED_MANY:
+	case MPOL_BIND:
+	case MPOL_INTERLEAVE:
+		nodes = pol->nodes;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		snprintf(p, maxlen, "unknown");
+		return;
+	}
+
+	/*
+	 * Every step below advances @p by the return value, so it must be
+	 * the number of characters actually stored.  scnprintf() returns
+	 * that; snprintf() returns the untruncated length, which would move
+	 * @p past the end of @buffer and turn the next "remaining size"
+	 * (buffer + maxlen - p) negative whenever the output is truncated.
+	 */
+	p += scnprintf(p, maxlen, "%s", policy_modes[mode]);
+
+	if (flags & MPOL_MODE_FLAGS) {
+		p += scnprintf(p, buffer + maxlen - p, "=");
+
+		/*
+		 * Currently, the only defined flags are mutually exclusive
+		 */
+		if (flags & MPOL_F_STATIC_NODES)
+			p += scnprintf(p, buffer + maxlen - p, "static");
+		else if (flags & MPOL_F_RELATIVE_NODES)
+			p += scnprintf(p, buffer + maxlen - p, "relative");
+	}
+
+	if (!nodes_empty(nodes))
+		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
+			       nodemask_pr_args(&nodes));
+}
diff --git a/mm/mempool.c b/mm/mempool.c
new file mode 100644
index 000000000..96488b13a
--- /dev/null
+++ b/mm/mempool.c
@@ -0,0 +1,556 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  linux/mm/mempool.c
+ *
+ *  memory buffer pool support. Such pools are mostly used
+ *  for guaranteed, deadlock-free memory allocations during
+ *  extreme VM load.
+ *
+ *  started by Ingo Molnar, Copyright (C) 2001
+ *  debugging by David Rientjes, Copyright (C) 2015
+ */
+
+/*
+ * NOTE(review): this copy of the patch carried eight bare "#include"
+ * directives with their operands missing.  The header names below are
+ * restored from the upstream v6.1 file; confirm against it.
+ */
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kasan.h>
+#include <linux/kmemleak.h>
+#include <linux/export.h>
+#include <linux/mempool.h>
+#include <linux/writeback.h>
+#include "slab.h"
+
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
+/*
+ * Report a poison mismatch found at offset @byte of @element: log the pool,
+ * the bytes within BITS_PER_LONG / 8 either side of the mismatch (clamped to
+ * the element), and dump the stack.
+ */
+static void poison_error(mempool_t *pool, void *element, size_t size,
+			 size_t byte)
+{
+	const int nr = pool->curr_nr;
+	const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0);
+	const int end = min_t(int, byte + (BITS_PER_LONG / 8), size);
+	int i;
+
+	pr_err("BUG: mempool element poison mismatch\n");
+	pr_err("Mempool %p size %zu\n", pool, size);
+	pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : "");
+	for (i = start; i < end; i++)
+		pr_cont("%x ", *(u8 *)(element + i));
+	pr_cont("%s\n", end < size ? "..." : "");
+	dump_stack();
+}
+
+/*
+ * Verify the pattern written by __poison_element(): POISON_FREE in every
+ * byte but the last, POISON_END in the last.  The first mismatch is reported
+ * and checking stops; a clean element is refilled with POISON_INUSE.
+ */
+static void __check_element(mempool_t *pool, void *element, size_t size)
+{
+	u8 *obj = element;
+	size_t i;
+
+	for (i = 0; i < size; i++) {
+		u8 exp = (i < size - 1) ? POISON_FREE : POISON_END;
+
+		if (obj[i] != exp) {
+			poison_error(pool, element, size, i);
+			return;
+		}
+	}
+	memset(obj, POISON_INUSE, size);
+}
+
+/*
+ * Check the poison of an element leaving the pool.  Only pools using the
+ * slab, kmalloc or page helpers from this file are checked; elements of any
+ * other pool are left alone.
+ */
+static void check_element(mempool_t *pool, void *element)
+{
+	/* Mempools backed by slab allocator */
+	if (pool->free == mempool_free_slab || pool->free == mempool_kfree) {
+		__check_element(pool, element, ksize(element));
+	} else if (pool->free == mempool_free_pages) {
+		/* Mempools backed by page allocator */
+		int order = (int)(long)pool->pool_data;
+		void *addr = kmap_atomic((struct page *)element);
+
+		__check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
+		kunmap_atomic(addr);
+	}
+}
+
+/* Fill @element with POISON_FREE and terminate it with POISON_END. */
+static void __poison_element(void *element, size_t size)
+{
+	u8 *obj = element;
+
+	memset(obj, POISON_FREE, size - 1);
+	obj[size - 1] = POISON_END;
+}
+
+/*
+ * Poison an element entering the pool; the counterpart of check_element(),
+ * with the pool type recognised through its alloc callback.
+ */
+static void poison_element(mempool_t *pool, void *element)
+{
+	/* Mempools backed by slab allocator */
+	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) {
+		__poison_element(element, ksize(element));
+	} else if (pool->alloc == mempool_alloc_pages) {
+		/* Mempools backed by page allocator */
+		int order = (int)(long)pool->pool_data;
+		void *addr = kmap_atomic((struct page *)element);
+
+		__poison_element(addr, 1UL << (PAGE_SHIFT + order));
+		kunmap_atomic(addr);
+	}
+}
+#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+static inline void check_element(mempool_t *pool, void *element)
+{
+}
+static inline void poison_element(mempool_t *pool, void *element)
+{
+}
+#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+
+/* KASAN-poison an element that is being parked in the pool. */
+static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
+{
+	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+		kasan_slab_free_mempool(element);
+	else if (pool->alloc == mempool_alloc_pages)
+		kasan_poison_pages(element, (unsigned long)pool->pool_data,
+				   false);
+}
+
+/* KASAN-unpoison an element that is being handed out from the pool. */
+static void kasan_unpoison_element(mempool_t *pool, void *element)
+{
+	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+		kasan_unpoison_range(element, __ksize(element));
+	else if (pool->alloc == mempool_alloc_pages)
+		kasan_unpoison_pages(element, (unsigned long)pool->pool_data,
+				     false);
+}
+
+/* Push @element onto the reserve; the pool must not already be full. */
+static __always_inline void add_element(mempool_t *pool, void *element)
+{
+	BUG_ON(pool->curr_nr >= pool->min_nr);
+	poison_element(pool, element);
+	kasan_poison_element(pool, element);
+	pool->elements[pool->curr_nr++] = element;
+}
+
+/* Pop the most recently added element off the reserve. */
+static void *remove_element(mempool_t *pool)
+{
+	void *element;
+
+	/*
+	 * Check for an empty pool before touching the array, so that an
+	 * underflow is caught without first reading pool->elements[-1].
+	 */
+	BUG_ON(pool->curr_nr <= 0);
+	element = pool->elements[--pool->curr_nr];
+
+	kasan_unpoison_element(pool, element);
+	check_element(pool, element);
+	return element;
+}
+
+/**
+ * mempool_exit - exit a mempool initialized with mempool_init()
+ * @pool:	pointer to the memory pool which was initialized with
+ *		mempool_init().
+ *
+ * Free all reserved elements in @pool and the element array.  @pool itself
+ * is not freed.  This function only sleeps if the free_fn() function sleeps.
+ *
+ * May be called on a zeroed but uninitialized mempool (i.e. allocated with
+ * kzalloc()).
+ */
+void mempool_exit(mempool_t *pool)
+{
+	while (pool->curr_nr) {
+		void *element = remove_element(pool);
+		pool->free(element, pool->pool_data);
+	}
+	kfree(pool->elements);
+	pool->elements = NULL;
+}
+EXPORT_SYMBOL(mempool_exit);
+
+/**
+ * mempool_destroy - deallocate a memory pool
+ * @pool:	pointer to the memory pool which was allocated via
+ *		mempool_create().
+ *
+ * Free all reserved elements in @pool and @pool itself.  This function
+ * only sleeps if the free_fn() function sleeps.
+ */
+void mempool_destroy(mempool_t *pool)
+{
+	/* Destroying a NULL pool is a no-op. */
+	if (unlikely(!pool))
+		return;
+
+	mempool_exit(pool);
+	kfree(pool);
+}
+EXPORT_SYMBOL(mempool_destroy);
+
+/**
+ * mempool_init_node - initialize a memory pool with explicit gfp mask and node
+ * @pool:      pointer to the memory pool that should be initialized
+ * @min_nr:    the minimum number of elements guaranteed to be
+ *             allocated for this pool.
+ * @alloc_fn:  user-defined element-allocation function.
+ * @free_fn:   user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ * @gfp_mask:  allocation mask used for the element array and for the
+ *             pre-allocated elements.
+ * @node_id:   node passed to the element array allocation.
+ *
+ * Return: %0 on success, %-ENOMEM if the element array or any of the
+ * @min_nr elements cannot be allocated.  On failure, elements allocated so
+ * far and the element array are released again.
+ */
+int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+		      mempool_free_t *free_fn, void *pool_data,
+		      gfp_t gfp_mask, int node_id)
+{
+	spin_lock_init(&pool->lock);
+	pool->min_nr	= min_nr;
+	/*
+	 * The fill loop below and mempool_exit() both read curr_nr, so start
+	 * it at zero here rather than relying on the caller having zeroed
+	 * the structure.
+	 */
+	pool->curr_nr	= 0;
+	pool->pool_data = pool_data;
+	pool->alloc	= alloc_fn;
+	pool->free	= free_fn;
+	init_waitqueue_head(&pool->wait);
+
+	pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
+					    gfp_mask, node_id);
+	if (!pool->elements)
+		return -ENOMEM;
+
+	/*
+	 * First pre-allocate the guaranteed number of buffers.
+	 */
+	while (pool->curr_nr < pool->min_nr) {
+		void *element;
+
+		element = pool->alloc(gfp_mask, pool->pool_data);
+		if (unlikely(!element)) {
+			mempool_exit(pool);
+			return -ENOMEM;
+		}
+		add_element(pool, element);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(mempool_init_node);
+
+/**
+ * mempool_init - initialize a memory pool
+ * @pool:      pointer to the memory pool that should be initialized
+ * @min_nr:    the minimum number of elements guaranteed to be
+ *             allocated for this pool.
+ * @alloc_fn:  user-defined element-allocation function.
+ * @free_fn:   user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ *
+ * Like mempool_create(), but initializes the pool in place (i.e. embedded in
+ * another structure).  Uses %GFP_KERNEL and %NUMA_NO_NODE.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+		 mempool_free_t *free_fn, void *pool_data)
+{
+	return mempool_init_node(pool, min_nr, alloc_fn, free_fn,
+				 pool_data, GFP_KERNEL, NUMA_NO_NODE);
+
+}
+EXPORT_SYMBOL(mempool_init);
+
+/**
+ * mempool_create - create a memory pool
+ * @min_nr:    the minimum number of elements guaranteed to be
+ *             allocated for this pool.
+ * @alloc_fn:  user-defined element-allocation function.
+ * @free_fn:   user-defined element-freeing function.
+ * @pool_data: optional private data available to the user-defined functions.
+ *
+ * Creates a pool with a guaranteed number of preallocated elements, for use
+ * with mempool_alloc() and mempool_free().  This function might sleep.  Both
+ * alloc_fn() and free_fn() may sleep too, as long as mempool_alloc() is not
+ * called from IRQ contexts.
+ *
+ * Return: pointer to the created memory pool object or %NULL on error.
+ */
+mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+				mempool_free_t *free_fn, void *pool_data)
+{
+	/* Same as the node-aware variant with GFP_KERNEL and no node hint. */
+	return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data,
+				   GFP_KERNEL, NUMA_NO_NODE);
+}
+EXPORT_SYMBOL(mempool_create);
+
+/*
+ * Allocate a zeroed mempool_t with @gfp_mask on @node_id and initialize it
+ * through mempool_init_node().  Returns the pool, or NULL if either the
+ * allocation or the initialization fails.
+ */
+mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
+			       mempool_free_t *free_fn, void *pool_data,
+			       gfp_t gfp_mask, int node_id)
+{
+	mempool_t *pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
+	int err;
+
+	if (!pool)
+		return NULL;
+
+	err = mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data,
+				gfp_mask, node_id);
+	if (!err)
+		return pool;
+
+	/* Initialization already undid its own work; drop the shell. */
+	kfree(pool);
+	return NULL;
+}
+EXPORT_SYMBOL(mempool_create_node);
+
+/**
+ * mempool_resize - resize an existing memory pool
+ * @pool:       pointer to the memory pool which was allocated via
+ *              mempool_create().
+ * @new_min_nr: the new minimum number of elements guaranteed to be
+ *              allocated for this pool.
+ *
+ * This function shrinks/grows the pool. In the case of growing,
+ * it cannot be guaranteed that the pool will be grown to the new
+ * size immediately, but new mempool_free() calls will refill it.
+ * This function may sleep.
+ *
+ * Note, the caller must guarantee that no mempool_destroy is called
+ * while this function is running. mempool_alloc() & mempool_free()
+ * might be called (eg. from IRQ contexts) while this function executes.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int mempool_resize(mempool_t *pool, int new_min_nr)
+{
+	void *element;
+	void **new_elements;
+	unsigned long flags;
+
+	BUG_ON(new_min_nr <= 0);
+	might_sleep();
+
+	spin_lock_irqsave(&pool->lock, flags);
+	if (new_min_nr <= pool->min_nr) {
+		/*
+		 * Shrinking (or no change): release surplus elements one at
+		 * a time.  The lock is dropped around free_fn() and retaken
+		 * before the loop condition is evaluated again.
+		 */
+		while (new_min_nr < pool->curr_nr) {
+			element = remove_element(pool);
+			spin_unlock_irqrestore(&pool->lock, flags);
+			pool->free(element, pool->pool_data);
+			spin_lock_irqsave(&pool->lock, flags);
+		}
+		pool->min_nr = new_min_nr;
+		goto out_unlock;
+	}
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	/* Grow the pool */
+	/* The new array is allocated with the lock dropped. */
+	new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements),
+				     GFP_KERNEL);
+	if (!new_elements)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	if (unlikely(new_min_nr <= pool->min_nr)) {
+		/* Raced, other resize will do our work */
+		spin_unlock_irqrestore(&pool->lock, flags);
+		kfree(new_elements);
+		goto out;
+	}
+	/* Carry the current reserve over and switch to the larger array. */
+	memcpy(new_elements, pool->elements,
+			pool->curr_nr * sizeof(*new_elements));
+	kfree(pool->elements);
+	pool->elements = new_elements;
+	pool->min_nr = new_min_nr;
+
+	/*
+	 * Top the reserve up to the new minimum.  Each element is allocated
+	 * with the lock dropped, so the fill level is re-checked once the
+	 * lock is held again.  An allocation failure ends the top-up but
+	 * the function still returns 0.
+	 */
+	while (pool->curr_nr < pool->min_nr) {
+		spin_unlock_irqrestore(&pool->lock, flags);
+		element = pool->alloc(GFP_KERNEL, pool->pool_data);
+		if (!element)
+			goto out;
+		spin_lock_irqsave(&pool->lock, flags);
+		if (pool->curr_nr < pool->min_nr) {
+			add_element(pool, element);
+		} else {
+			spin_unlock_irqrestore(&pool->lock, flags);
+			pool->free(element, pool->pool_data);	/* Raced */
+			goto out;
+		}
+	}
+out_unlock:
+	spin_unlock_irqrestore(&pool->lock, flags);
+out:
+	return 0;
+}
+EXPORT_SYMBOL(mempool_resize);
+
+/**
+ * mempool_alloc - allocate an element from a specific memory pool
+ * @pool:      pointer to the memory pool which was allocated via
+ *             mempool_create().
+ * @gfp_mask:  the usual allocation bitmask.
+ *
+ * this function only sleeps if the alloc_fn() function sleeps or
+ * returns NULL. Note that due to preallocation, this function
+ * *never* fails when called from process contexts.
(it might
+ * fail if called from an IRQ context.)
+ * Note: using __GFP_ZERO is not supported.
+ *
+ * Return: pointer to the allocated element or %NULL on error.
+ */
+void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
+{
+	void *element;
+	unsigned long flags;
+	wait_queue_entry_t wait;
+	gfp_t gfp_temp;
+
+	VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
+	might_alloc(gfp_mask);
+
+	gfp_mask |= __GFP_NOMEMALLOC;	/* don't allocate emergency reserves */
+	gfp_mask |= __GFP_NORETRY;	/* don't loop in __alloc_pages */
+	gfp_mask |= __GFP_NOWARN;	/* failures are OK */
+
+	/* First attempt: no direct reclaim and no IO. */
+	gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
+
+repeat_alloc:
+
+	element = pool->alloc(gfp_temp, pool->pool_data);
+	if (likely(element != NULL))
+		return element;
+
+	/* The allocator failed: fall back to the reserved elements. */
+	spin_lock_irqsave(&pool->lock, flags);
+	if (likely(pool->curr_nr)) {
+		element = remove_element(pool);
+		spin_unlock_irqrestore(&pool->lock, flags);
+		/* paired with rmb in mempool_free(), read comment there */
+		smp_wmb();
+		/*
+		 * Update the allocation stack trace as this is more useful
+		 * for debugging.
+		 */
+		kmemleak_update_trace(element);
+		return element;
+	}
+
+	/*
+	 * We use gfp mask w/o direct reclaim or IO for the first round.  If
+	 * alloc failed with that and @pool was empty, retry immediately.
+	 */
+	if (gfp_temp != gfp_mask) {
+		spin_unlock_irqrestore(&pool->lock, flags);
+		gfp_temp = gfp_mask;
+		goto repeat_alloc;
+	}
+
+	/* We must not sleep if !__GFP_DIRECT_RECLAIM */
+	if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
+		spin_unlock_irqrestore(&pool->lock, flags);
+		return NULL;
+	}
+
+	/* Let's wait for someone else to return an element to @pool */
+	init_wait(&wait);
+	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	/*
+	 * FIXME: this should be io_schedule().  The timeout is there as a
+	 * workaround for some DM problems in 2.6.18.
+	 */
+	io_schedule_timeout(5*HZ);
+
+	finish_wait(&pool->wait, &wait);
+	goto repeat_alloc;
+}
+EXPORT_SYMBOL(mempool_alloc);
+
+/**
+ * mempool_free - return an element to the pool.
+ * @element:   pool element pointer.
+ * @pool:      pointer to the memory pool which was allocated via
+ *             mempool_create().
+ *
+ * A %NULL @element is ignored.
+ *
+ * this function only sleeps if the free_fn() function sleeps.
+ */
+void mempool_free(void *element, mempool_t *pool)
+{
+	unsigned long flags;
+
+	if (unlikely(element == NULL))
+		return;
+
+	/*
+	 * Paired with the wmb in mempool_alloc().  The preceding read is
+	 * for @element and the following @pool->curr_nr.  This ensures
+	 * that the visible value of @pool->curr_nr is from after the
+	 * allocation of @element.  This is necessary for fringe cases
+	 * where @element was passed to this task without going through
+	 * barriers.
+	 *
+	 * For example, assume @p is %NULL at the beginning and one task
+	 * performs "p = mempool_alloc(...);" while another task is doing
+	 * "while (!p) cpu_relax(); mempool_free(p, ...);".  This function
+	 * may end up using curr_nr value which is from before allocation
+	 * of @p without the following rmb.
+	 */
+	smp_rmb();
+
+	/*
+	 * For correctness, we need a test which is guaranteed to trigger
+	 * if curr_nr + #allocated == min_nr.  Testing curr_nr < min_nr
+	 * without locking achieves that and refilling as soon as possible
+	 * is desirable.
+	 *
+	 * Because curr_nr visible here is always a value after the
+	 * allocation of @element, any task which decremented curr_nr below
+	 * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets
+	 * incremented to min_nr afterwards.  If curr_nr gets incremented
+	 * to min_nr after the allocation of @element, the elements
+	 * allocated after that are subject to the same guarantee.
+	 *
+	 * Waiters happen iff curr_nr is 0 and the above guarantee also
+	 * ensures that there will be frees which return elements to the
+	 * pool waking up the waiters.
+	 */
+	if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
+		spin_lock_irqsave(&pool->lock, flags);
+		/* Re-check under the lock; the unlocked read was only a hint. */
+		if (likely(pool->curr_nr < pool->min_nr)) {
+			add_element(pool, element);
+			spin_unlock_irqrestore(&pool->lock, flags);
+			wake_up(&pool->wait);
+			return;
+		}
+		spin_unlock_irqrestore(&pool->lock, flags);
+	}
+	/* The reserve is full: hand the element back to the allocator. */
+	pool->free(element, pool->pool_data);
+}
+EXPORT_SYMBOL(mempool_free);
+
+/*
+ * A commonly used alloc and free fn.
+ * @pool_data is the kmem_cache to allocate from; the cache must not have
+ * a constructor.
+ */
+void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
+{
+	struct kmem_cache *mem = pool_data;
+	VM_BUG_ON(mem->ctor);
+	return kmem_cache_alloc(mem, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_alloc_slab);
+
+void mempool_free_slab(void *element, void *pool_data)
+{
+	struct kmem_cache *mem = pool_data;
+	kmem_cache_free(mem, element);
+}
+EXPORT_SYMBOL(mempool_free_slab);
+
+/*
+ * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
+ * specified by pool_data
+ */
+void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
+{
+	size_t size = (size_t)pool_data;
+	return kmalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kmalloc);
+
+void mempool_kfree(void *element, void *pool_data)
+{
+	kfree(element);
+}
+EXPORT_SYMBOL(mempool_kfree);
+
+/*
+ * A simple mempool-backed page allocator that allocates pages
+ * of the order specified by pool_data.
+ */
+void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
+{
+	int order = (int)(long)pool_data;
+	return alloc_pages(gfp_mask, order);
+}
+EXPORT_SYMBOL(mempool_alloc_pages);
+
+void mempool_free_pages(void *element, void *pool_data)
+{
+	int order = (int)(long)pool_data;
+	__free_pages(element, order);
+}
+EXPORT_SYMBOL(mempool_free_pages);
diff --git a/mm/memremap.c b/mm/memremap.c
new file mode 100644
index 000000000..08cbf54fe
--- /dev/null
+++ b/mm/memremap.c
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2015 Intel Corporation. All rights reserved.
*/
+/*
+ * NOTE(review): this copy of the patch carried twelve bare "#include"
+ * directives with their operands missing.  The header names below are
+ * restored from the upstream v6.1 file; confirm against it.
+ */
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/kasan.h>
+#include <linux/memory_hotplug.h>
+#include <linux/memremap.h>
+#include <linux/pfn_t.h>
+#include <linux/swap.h>
+#include <linux/mmzone.h>
+#include <linux/swapops.h>
+#include <linux/types.h>
+#include <linux/wait_bit.h>
+#include <linux/xarray.h>
+#include "internal.h"
+
+/* pfn -> dev_pagemap lookup used by get_dev_pagemap(). */
+static DEFINE_XARRAY(pgmap_array);
+
+/*
+ * The memremap() and memremap_pages() interfaces are alternately used
+ * to map persistent memory namespaces. These interfaces place different
+ * constraints on the alignment and size of the mapping (namespace).
+ * memremap() can map individual PAGE_SIZE pages. memremap_pages() can
+ * only map subsections (2MB), and at least one architecture (PowerPC)
+ * the minimum mapping granularity of memremap_pages() is 16MB.
+ *
+ * The role of memremap_compat_align() is to communicate the minimum
+ * arch supported alignment of a namespace such that it can freely
+ * switch modes without violating the arch constraint. Namely, do not
+ * allow a namespace to be PAGE_SIZE aligned since that namespace may be
+ * reconfigured into a mode that requires SUBSECTION_SIZE alignment.
+ */
+#ifndef CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN
+unsigned long memremap_compat_align(void)
+{
+	return SUBSECTION_SIZE;
+}
+EXPORT_SYMBOL_GPL(memremap_compat_align);
+#endif
+
+#ifdef CONFIG_FS_DAX
+DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
+EXPORT_SYMBOL(devmap_managed_key);
+
+/* Only MEMORY_DEVICE_FS_DAX pagemaps count towards devmap_managed_key. */
+static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
+{
+	if (pgmap->type == MEMORY_DEVICE_FS_DAX)
+		static_branch_dec(&devmap_managed_key);
+}
+
+static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
+{
+	if (pgmap->type == MEMORY_DEVICE_FS_DAX)
+		static_branch_inc(&devmap_managed_key);
+}
+#else
+static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
+{
+}
+static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
+{
+}
+#endif /* CONFIG_FS_DAX */
+
+/*
+ * Clear the pgmap_array entries covering @range, then wait for an RCU grace
+ * period (get_dev_pagemap() looks the array up under rcu_read_lock()).
+ */
+static void pgmap_array_delete(struct range *range)
+{
+	xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end),
+			NULL, GFP_KERNEL);
+	synchronize_rcu();
+}
+
+/*
+ * First pfn of range @range_id.  For range 0 the altmap offset is added;
+ * other ranges start at the beginning of the range.
+ */
+static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id)
+{
+	struct range *range = &pgmap->ranges[range_id];
+	unsigned long pfn = PHYS_PFN(range->start);
+
+	if (range_id)
+		return pfn;
+	return pfn + vmem_altmap_offset(pgmap_altmap(pgmap));
+}
+
+/*
+ * Return true if @pfn lies in one of @pgmap's ranges at or after that
+ * range's first pfn as computed by pfn_first().
+ */
+bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
+{
+	int i;
+
+	for (i = 0; i < pgmap->nr_range; i++) {
+		struct range *range = &pgmap->ranges[i];
+
+		if (pfn >= PHYS_PFN(range->start) &&
+		    pfn <= PHYS_PFN(range->end))
+			return pfn >= pfn_first(pgmap, i);
+	}
+
+	return false;
+}
+
+/* One past the last pfn of range @range_id. */
+static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
+{
+	const struct range *range = &pgmap->ranges[range_id];
+
+	return (range->start + range_len(range)) >> PAGE_SHIFT;
+}
+
+/* Number of pfns in range @range_id, scaled down by vmemmap_shift. */
+static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id)
+{
+	return (pfn_end(pgmap, range_id) -
+		pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift;
+}
+
+/* Undo pagemap_range() for range @range_id. */
+static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
+{
+	struct range *range = &pgmap->ranges[range_id];
+	struct page *first_page;
+
+	/* make sure to access a memmap that was actually initialized */
+	first_page = pfn_to_page(pfn_first(pgmap, range_id));
+
+	/* pages are dead and unused, undo the arch mapping */
+	mem_hotplug_begin();
+	remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start),
+				   PHYS_PFN(range_len(range)));
+	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+		__remove_pages(PHYS_PFN(range->start),
+			       PHYS_PFN(range_len(range)), NULL);
+	} else {
+		arch_remove_memory(range->start, range_len(range),
+				pgmap_altmap(pgmap));
+		kasan_remove_zero_shadow(__va(range->start), range_len(range));
+	}
+	mem_hotplug_done();
+
+	untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+	pgmap_array_delete(range);
+}
+
+/*
+ * Tear down a mapping set up by memremap_pages(): kill the reference, drop
+ * the per-pfn references taken in pagemap_range(), wait for the release
+ * callback to signal completion, then unmap each range.
+ */
+void memunmap_pages(struct dev_pagemap *pgmap)
+{
+	int i;
+
+	percpu_ref_kill(&pgmap->ref);
+	if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
+	    pgmap->type != MEMORY_DEVICE_COHERENT)
+		for (i = 0; i < pgmap->nr_range; i++)
+			percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
+
+	wait_for_completion(&pgmap->done);
+
+	for (i = 0; i < pgmap->nr_range; i++)
+		pageunmap_range(pgmap, i);
+	percpu_ref_exit(&pgmap->ref);
+
+	WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
+	devmap_managed_enable_put(pgmap);
+}
+EXPORT_SYMBOL_GPL(memunmap_pages);
+
+/* devm action registered by devm_memremap_pages(). */
+static void devm_memremap_pages_release(void *data)
+{
+	memunmap_pages(data);
+}
+
+/* percpu_ref release callback: wakes the waiter in memunmap_pages(). */
+static void dev_pagemap_percpu_release(struct percpu_ref *ref)
+{
+	struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref);
+
+	complete(&pgmap->done);
+}
+
+/*
+ * Map range @range_id of @pgmap: reject conflicts with existing pagemaps and
+ * with System RAM, publish the range in pgmap_array, add the memory and
+ * initialize its struct pages.  Returns 0 on success or a negative errno,
+ * in which case everything done so far for this range has been undone.
+ */
+static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
+		int range_id, int nid)
+{
+	const bool is_private = pgmap->type == MEMORY_DEVICE_PRIVATE;
+	struct range *range = &pgmap->ranges[range_id];
+	struct dev_pagemap *conflict_pgmap;
+	int error, is_ram;
+
+	if (WARN_ONCE(pgmap_altmap(pgmap) && range_id > 0,
+				"altmap not supported for multiple ranges\n"))
+		return -EINVAL;
+
+	conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL);
+	if (conflict_pgmap) {
+		WARN(1, "Conflicting mapping in same section\n");
+		put_dev_pagemap(conflict_pgmap);
+		return -ENOMEM;
+	}
+
+	conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL);
+	if (conflict_pgmap) {
+		WARN(1, "Conflicting mapping in same section\n");
+		put_dev_pagemap(conflict_pgmap);
+		return -ENOMEM;
+	}
+
+	is_ram = region_intersects(range->start, range_len(range),
+		IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
+
+	if (is_ram != REGION_DISJOINT) {
+		WARN_ONCE(1, "attempted on %s region %#llx-%#llx\n",
+				is_ram == REGION_MIXED ? "mixed" : "ram",
+				range->start, range->end);
+		return -ENXIO;
+	}
+
+	error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(range->start),
+				PHYS_PFN(range->end), pgmap, GFP_KERNEL));
+	if (error)
+		return error;
+
+	if (nid < 0)
+		nid = numa_mem_id();
+
+	error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0,
+			range_len(range));
+	if (error)
+		goto err_pfn_remap;
+
+	if (!mhp_range_allowed(range->start, range_len(range), !is_private)) {
+		error = -EINVAL;
+		goto err_kasan;
+	}
+
+	mem_hotplug_begin();
+
+	/*
+	 * For device private memory we call add_pages() as we only need to
+	 * allocate and initialize struct page for the device memory. More-
+	 * over the device memory is un-accessible thus we do not want to
+	 * create a linear mapping for the memory like arch_add_memory()
+	 * would do.
+	 *
+	 * For all other device memory types, which are accessible by
+	 * the CPU, we do want the linear mapping and thus use
+	 * arch_add_memory().
+	 */
+	if (is_private) {
+		error = add_pages(nid, PHYS_PFN(range->start),
+				PHYS_PFN(range_len(range)), params);
+	} else {
+		error = kasan_add_zero_shadow(__va(range->start), range_len(range));
+		if (error) {
+			mem_hotplug_done();
+			goto err_kasan;
+		}
+
+		error = arch_add_memory(nid, range->start, range_len(range),
+					params);
+	}
+
+	if (!error) {
+		struct zone *zone;
+
+		zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
+		move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
+				PHYS_PFN(range_len(range)), params->altmap,
+				MIGRATE_MOVABLE);
+	}
+
+	mem_hotplug_done();
+	if (error)
+		goto err_add_memory;
+
+	/*
+	 * Initialization of the pages has been deferred until now in order
+	 * to allow us to do the work while not holding the hotplug lock.
+	 */
+	memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+				PHYS_PFN(range->start),
+				PHYS_PFN(range_len(range)), pgmap);
+	if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
+	    pgmap->type != MEMORY_DEVICE_COHERENT)
+		percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
+	return 0;
+
+err_add_memory:
+	if (!is_private)
+		kasan_remove_zero_shadow(__va(range->start), range_len(range));
+err_kasan:
+	untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+err_pfn_remap:
+	pgmap_array_delete(range);
+	return error;
+}
+
+
+/*
+ * Not device managed version of devm_memremap_pages, undone by
+ * memunmap_pages().  Please use devm_memremap_pages if you have a struct
+ * device available.
+ *
+ * Returns __va() of the start of the first range on success, or an
+ * ERR_PTR() on failure.
+ */
+void *memremap_pages(struct dev_pagemap *pgmap, int nid)
+{
+	struct mhp_params params = {
+		.altmap = pgmap_altmap(pgmap),
+		.pgmap = pgmap,
+		.pgprot = PAGE_KERNEL,
+	};
+	const int nr_range = pgmap->nr_range;
+	int error, i;
+
+	if (WARN_ONCE(!nr_range, "nr_range must be specified\n"))
+		return ERR_PTR(-EINVAL);
+
+	switch (pgmap->type) {
+	case MEMORY_DEVICE_PRIVATE:
+		if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
+			WARN(1, "Device private memory not supported\n");
+			return ERR_PTR(-EINVAL);
+		}
+		if (!pgmap->ops || !pgmap->ops->migrate_to_ram) {
+			WARN(1, "Missing migrate_to_ram method\n");
+			return ERR_PTR(-EINVAL);
+		}
+		if (!pgmap->ops->page_free) {
+			WARN(1, "Missing page_free method\n");
+			return ERR_PTR(-EINVAL);
+		}
+		if (!pgmap->owner) {
+			WARN(1, "Missing owner\n");
+			return ERR_PTR(-EINVAL);
+		}
+		break;
+	case MEMORY_DEVICE_COHERENT:
+		/*
+		 * Check ->ops itself first, as the private case does: a
+		 * caller that left it NULL must get -EINVAL, not an oops.
+		 */
+		if (!pgmap->ops || !pgmap->ops->page_free) {
+			WARN(1, "Missing page_free method\n");
+			return ERR_PTR(-EINVAL);
+		}
+		if (!pgmap->owner) {
+			WARN(1, "Missing owner\n");
+			return ERR_PTR(-EINVAL);
+		}
+		break;
+	case MEMORY_DEVICE_FS_DAX:
+		if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
+			WARN(1, "File system DAX not supported\n");
+			return ERR_PTR(-EINVAL);
+		}
+		params.pgprot = pgprot_decrypted(params.pgprot);
+		break;
+	case MEMORY_DEVICE_GENERIC:
+		break;
+	case MEMORY_DEVICE_PCI_P2PDMA:
+		params.pgprot = pgprot_noncached(params.pgprot);
+		break;
+	default:
+		WARN(1, "Invalid pgmap type %d\n", pgmap->type);
+		break;
+	}
+
+	init_completion(&pgmap->done);
+	error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0,
+				GFP_KERNEL);
+	if (error)
+		return ERR_PTR(error);
+
+	devmap_managed_enable_get(pgmap);
+
+	/*
+	 * Clear the pgmap nr_range as it will be incremented for each
+	 * successfully processed range. This communicates how many
+	 * regions to unwind in the abort case.
+	 */
+	pgmap->nr_range = 0;
+	error = 0;
+	for (i = 0; i < nr_range; i++) {
+		error = pagemap_range(pgmap, &params, i, nid);
+		if (error)
+			break;
+		pgmap->nr_range++;
+	}
+
+	if (i < nr_range) {
+		memunmap_pages(pgmap);
+		/* Restore the caller's range count after the unwind. */
+		pgmap->nr_range = nr_range;
+		return ERR_PTR(error);
+	}
+
+	return __va(pgmap->ranges[0].start);
+}
+EXPORT_SYMBOL_GPL(memremap_pages);
+
+/**
+ * devm_memremap_pages - remap and provide memmap backing for the given resource
+ * @dev: hosting device for @res
+ * @pgmap: pointer to a struct dev_pagemap
+ *
+ * Notes:
+ * 1/ At a minimum the res and type members of @pgmap must be initialized
+ *    by the caller before passing it to this function
+ *
+ * 2/ The altmap field may optionally be initialized, in which case
+ *    PGMAP_ALTMAP_VALID must be set in pgmap->flags.
+ *
+ * 3/ The ref field may optionally be provided, in which pgmap->ref must be
+ *    'live' on entry and will be killed and reaped at
+ *    devm_memremap_pages_release() time, or if this routine fails.
+ *
+ * 4/ range is expected to be a host memory range that could feasibly be
+ *    treated as a "System RAM" range, i.e. not a device mmio range, but
+ *    this is not enforced.
+ */
+void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
+{
+	int error;
+	void *ret;
+
+	ret = memremap_pages(pgmap, dev_to_node(dev));
+	if (IS_ERR(ret))
+		return ret;
+	/*
+	 * Register the teardown callback against @dev.  NOTE(review): the
+	 * "_or_reset" helper presumably runs the action itself when
+	 * registration fails, which is why no explicit unwind follows --
+	 * confirm against the devres documentation.
+	 */
+	error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
+			pgmap);
+	if (error)
+		return ERR_PTR(error);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(devm_memremap_pages);
+/*
+ * Release, for @dev, the devm action that devm_memremap_pages() registered
+ * for @pgmap (same callback, same data pointer).
+ */
+void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
+{
+	devm_release_action(dev, devm_memremap_pages_release, pgmap);
+}
+EXPORT_SYMBOL_GPL(devm_memunmap_pages);
+/*
+ * Return altmap->reserve + altmap->free, or 0 when @altmap is NULL.
+ */
+unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+	/* number of pfns from base where pfn_to_page() is valid */
+	if (altmap)
+		return altmap->reserve + altmap->free;
+	return 0;
+}
+/*
+ * Subtract @nr_pfns from altmap->alloc.  Unlike vmem_altmap_offset() this
+ * does not tolerate a NULL @altmap.
+ */
+void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
+{
+	altmap->alloc -= nr_pfns;
+}
+
+/**
+ * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
+ * @pfn: page frame number to lookup page_map
+ * @pgmap: optional known pgmap that already has a reference
+ *
+ * If @pgmap is non-NULL and covers @pfn it will be returned as-is.  If @pgmap
+ * is non-NULL but does not cover @pfn the reference to it will be released.
+ *
+ * Return: the pgmap found for @pfn with a reference held, or NULL when the
+ * lookup finds nothing or a live reference cannot be taken.
+ */
+struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
+		struct dev_pagemap *pgmap)
+{
+	resource_size_t phys = PFN_PHYS(pfn);
+
+	/*
+	 * In the cached case we're already holding a live reference.
+	 * Only pgmap->range is compared here; a pfn outside it drops the
+	 * cached reference and goes through the lookup below.
+	 */
+	if (pgmap) {
+		if (phys >= pgmap->range.start && phys <= pgmap->range.end)
+			return pgmap;
+		put_dev_pagemap(pgmap);
+	}
+
+	/* fall back to slow path lookup */
+	rcu_read_lock();
+	pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
+	if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref))
+		pgmap = NULL;
+	rcu_read_unlock();
+
+	return pgmap;
+}
+EXPORT_SYMBOL_GPL(get_dev_pagemap);
+/*
+ * Hand a ZONE_DEVICE page back to its owner through pgmap->ops->page_free()
+ * after uncharging it from its memcg and clearing page->mapping.  Warns once
+ * and does nothing if the pgmap has no page_free callback.
+ */
+void free_zone_device_page(struct page *page)
+{
+	if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free))
+		return;
+
+	mem_cgroup_uncharge(page_folio(page));
+
+	/*
+	 * Note: we don't expect anonymous compound pages yet. Once supported
+	 * and we could PTE-map them similar to THP, we'd have to clear
+	 * PG_anon_exclusive on all tail pages.
+	 */
+	VM_BUG_ON_PAGE(PageAnon(page) && PageCompound(page), page);
+	if (PageAnon(page))
+		__ClearPageAnonExclusive(page);
+
+	/*
+	 * When a device managed page is freed, the page->mapping field
+	 * may still contain a (stale) mapping value. For example, the
+	 * lower bits of page->mapping may still identify the page as an
+	 * anonymous page. Ultimately, this entire field is just stale
+	 * and wrong, and it will cause errors if not cleared.  One
+	 * example is:
+	 *
+	 *  migrate_vma_pages()
+	 *    migrate_vma_insert_page()
+	 *      page_add_new_anon_rmap()
+	 *        __page_set_anon_rmap()
+	 *          ...checks page->mapping, via PageAnon(page) call,
+	 *            and incorrectly concludes that the page is an
+	 *            anonymous page. Therefore, it incorrectly,
+	 *            silently fails to set up the new anon rmap.
+	 *
+	 * For other types of ZONE_DEVICE pages, migration is either
+	 * handled differently or not done at all, so there is no need
+	 * to clear page->mapping.
+	 */
+	page->mapping = NULL;
+	page->pgmap->ops->page_free(page);
+
+	if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
+	    page->pgmap->type != MEMORY_DEVICE_COHERENT)
+		/*
+		 * Reset the page count to 1 to prepare for handing out the page
+		 * again.
+ */ + set_page_count(page, 1); + else + put_dev_pagemap(page->pgmap); +} + +void zone_device_page_init(struct page *page) +{ + /* + * Drivers shouldn't be allocating pages after calling + * memunmap_pages(). + */ + WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref)); + set_page_count(page, 1); + lock_page(page); +} +EXPORT_SYMBOL_GPL(zone_device_page_init); + +#ifdef CONFIG_FS_DAX +bool __put_devmap_managed_page_refs(struct page *page, int refs) +{ + if (page->pgmap->type != MEMORY_DEVICE_FS_DAX) + return false; + + /* + * fsdax page refcounts are 1-based, rather than 0-based: if + * refcount is 1, then the page is free and the refcount is + * stable because nobody holds a reference on the page. + */ + if (page_ref_sub_return(page, refs) == 1) + wake_up_var(&page->_refcount); + return true; +} +EXPORT_SYMBOL(__put_devmap_managed_page_refs); +#endif /* CONFIG_FS_DAX */ diff --git a/mm/memtest.c b/mm/memtest.c new file mode 100644 index 000000000..f53ace709 --- /dev/null +++ b/mm/memtest.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +static u64 patterns[] __initdata = { + /* The first entry has to be 0 to leave memtest with zeroed memory */ + 0, + 0xffffffffffffffffULL, + 0x5555555555555555ULL, + 0xaaaaaaaaaaaaaaaaULL, + 0x1111111111111111ULL, + 0x2222222222222222ULL, + 0x4444444444444444ULL, + 0x8888888888888888ULL, + 0x3333333333333333ULL, + 0x6666666666666666ULL, + 0x9999999999999999ULL, + 0xccccccccccccccccULL, + 0x7777777777777777ULL, + 0xbbbbbbbbbbbbbbbbULL, + 0xddddddddddddddddULL, + 0xeeeeeeeeeeeeeeeeULL, + 0x7a6c7258554e494cULL, /* yeah ;-) */ +}; + +static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) +{ + pr_info(" %016llx bad mem addr %pa - %pa reserved\n", + cpu_to_be64(pattern), &start_bad, &end_bad); + memblock_reserve(start_bad, end_bad - start_bad); +} + +static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size) +{ + u64 *p, 
*start, *end; + phys_addr_t start_bad, last_bad; + phys_addr_t start_phys_aligned; + const size_t incr = sizeof(pattern); + + start_phys_aligned = ALIGN(start_phys, incr); + start = __va(start_phys_aligned); + end = start + (size - (start_phys_aligned - start_phys)) / incr; + start_bad = 0; + last_bad = 0; + + for (p = start; p < end; p++) + *p = pattern; + + for (p = start; p < end; p++, start_phys_aligned += incr) { + if (*p == pattern) + continue; + if (start_phys_aligned == last_bad + incr) { + last_bad += incr; + continue; + } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); + start_bad = last_bad = start_phys_aligned; + } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); +} + +static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end) +{ + u64 i; + phys_addr_t this_start, this_end; + + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start, + &this_end, NULL) { + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); + if (this_start < this_end) { + pr_info(" %pa - %pa pattern %016llx\n", + &this_start, &this_end, cpu_to_be64(pattern)); + memtest(pattern, this_start, this_end - this_start); + } + } +} + +/* default is disabled */ +static unsigned int memtest_pattern __initdata; + +static int __init parse_memtest(char *arg) +{ + int ret = 0; + + if (arg) + ret = kstrtouint(arg, 0, &memtest_pattern); + else + memtest_pattern = ARRAY_SIZE(patterns); + + return ret; +} + +early_param("memtest", parse_memtest); + +void __init early_memtest(phys_addr_t start, phys_addr_t end) +{ + unsigned int i; + unsigned int idx = 0; + + if (!memtest_pattern) + return; + + pr_info("early_memtest: # of tests: %u\n", memtest_pattern); + for (i = memtest_pattern-1; i < UINT_MAX; --i) { + idx = i % ARRAY_SIZE(patterns); + do_one_pass(patterns[idx], start, end); + } +} diff --git a/mm/migrate.c b/mm/migrate.c new file mode 100644 index 000000000..c93dd6a31 --- /dev/null +++ 
b/mm/migrate.c @@ -0,0 +1,2237 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Memory Migration functionality - linux/mm/migrate.c + * + * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter + * + * Page migration was first developed in the context of the memory hotplug + * project. The main authors of the migration code are: + * + * IWAMOTO Toshihiro + * Hirokazu Takahashi + * Dave Hansen + * Christoph Lameter + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "internal.h" + +int isolate_movable_page(struct page *page, isolate_mode_t mode) +{ + const struct movable_operations *mops; + + /* + * Avoid burning cycles with pages that are yet under __free_pages(), + * or just got freed under us. + * + * In case we 'win' a race for a movable page being freed under us and + * raise its refcount preventing __free_pages() from doing its job + * the put_page() at the end of this block will take care of + * release this page, thus avoiding a nasty leakage. + */ + if (unlikely(!get_page_unless_zero(page))) + goto out; + + /* + * Check PageMovable before holding a PG_lock because page's owner + * assumes anybody doesn't touch PG_lock of newly allocated page + * so unconditionally grabbing the lock ruins page's owner side. + */ + if (unlikely(!__PageMovable(page))) + goto out_putpage; + /* + * As movable pages are not isolated from LRU lists, concurrent + * compaction threads can race against page migration functions + * as well as race against the releasing a page. 
+ * + * In order to avoid having an already isolated movable page + * being (wrongly) re-isolated while it is under migration, + * or to avoid attempting to isolate pages being released, + * lets be sure we have the page lock + * before proceeding with the movable page isolation steps. + */ + if (unlikely(!trylock_page(page))) + goto out_putpage; + + if (!PageMovable(page) || PageIsolated(page)) + goto out_no_isolated; + + mops = page_movable_ops(page); + VM_BUG_ON_PAGE(!mops, page); + + if (!mops->isolate_page(page, mode)) + goto out_no_isolated; + + /* Driver shouldn't use PG_isolated bit of page->flags */ + WARN_ON_ONCE(PageIsolated(page)); + SetPageIsolated(page); + unlock_page(page); + + return 0; + +out_no_isolated: + unlock_page(page); +out_putpage: + put_page(page); +out: + return -EBUSY; +} + +static void putback_movable_page(struct page *page) +{ + const struct movable_operations *mops = page_movable_ops(page); + + mops->putback_page(page); + ClearPageIsolated(page); +} + +/* + * Put previously isolated pages back onto the appropriate lists + * from where they were once taken off for compaction/migration. + * + * This function shall be used whenever the isolated pageset has been + * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() + * and isolate_hugetlb(). + */ +void putback_movable_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + + list_for_each_entry_safe(page, page2, l, lru) { + if (unlikely(PageHuge(page))) { + putback_active_hugepage(page); + continue; + } + list_del(&page->lru); + /* + * We isolated non-lru movable page so here we can use + * __PageMovable because LRU page's mapping cannot have + * PAGE_MAPPING_MOVABLE. 
+ */ + if (unlikely(__PageMovable(page))) { + VM_BUG_ON_PAGE(!PageIsolated(page), page); + lock_page(page); + if (PageMovable(page)) + putback_movable_page(page); + else + ClearPageIsolated(page); + unlock_page(page); + put_page(page); + } else { + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + + page_is_file_lru(page), -thp_nr_pages(page)); + putback_lru_page(page); + } + } +} + +/* + * Restore a potential migration pte to a working pte entry + */ +static bool remove_migration_pte(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr, void *old) +{ + DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); + + while (page_vma_mapped_walk(&pvmw)) { + rmap_t rmap_flags = RMAP_NONE; + pte_t pte; + swp_entry_t entry; + struct page *new; + unsigned long idx = 0; + + /* pgoff is invalid for ksm pages, but they are never large */ + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff; + new = folio_page(folio, idx); + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + /* PMD-mapped THP migration entry */ + if (!pvmw.pte) { + VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || + !folio_test_pmd_mappable(folio), folio); + remove_migration_pmd(&pvmw, new); + continue; + } +#endif + + folio_get(folio); + pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); + if (pte_swp_soft_dirty(*pvmw.pte)) + pte = pte_mksoft_dirty(pte); + + /* + * Recheck VMA as permissions can change since migration started + */ + entry = pte_to_swp_entry(*pvmw.pte); + if (!is_migration_entry_young(entry)) + pte = pte_mkold(pte); + if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) + pte = pte_mkdirty(pte); + if (is_writable_migration_entry(entry)) + pte = maybe_mkwrite(pte, vma); + else if (pte_swp_uffd_wp(*pvmw.pte)) + pte = pte_mkuffd_wp(pte); + else + pte = pte_wrprotect(pte); + + if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) + rmap_flags |= RMAP_EXCLUSIVE; + + if 
(unlikely(is_device_private_page(new))) { + if (pte_write(pte)) + entry = make_writable_device_private_entry( + page_to_pfn(new)); + else + entry = make_readable_device_private_entry( + page_to_pfn(new)); + pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*pvmw.pte)) + pte = pte_swp_mksoft_dirty(pte); + if (pte_swp_uffd_wp(*pvmw.pte)) + pte = pte_swp_mkuffd_wp(pte); + } + +#ifdef CONFIG_HUGETLB_PAGE + if (folio_test_hugetlb(folio)) { + unsigned int shift = huge_page_shift(hstate_vma(vma)); + + pte = pte_mkhuge(pte); + pte = arch_make_huge_pte(pte, shift, vma->vm_flags); + if (folio_test_anon(folio)) + hugepage_add_anon_rmap(new, vma, pvmw.address, + rmap_flags); + else + page_dup_file_rmap(new, true); + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); + } else +#endif + { + if (folio_test_anon(folio)) + page_add_anon_rmap(new, vma, pvmw.address, + rmap_flags); + else + page_add_file_rmap(new, vma, false); + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); + } + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain_local(); + + trace_remove_migration_pte(pvmw.address, pte_val(pte), + compound_order(new)); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, pvmw.address, pvmw.pte); + } + + return true; +} + +/* + * Get rid of all migration entries and replace them by + * references to the indicated page. + */ +void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) +{ + struct rmap_walk_control rwc = { + .rmap_one = remove_migration_pte, + .arg = src, + }; + + if (locked) + rmap_walk_locked(dst, &rwc); + else + rmap_walk(dst, &rwc); +} + +/* + * Something used the pte of a page under migration. We need to + * get to the page and wait until migration is finished. + * When we return from this function the fault will be retried. 
+ */ +void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl) +{ + pte_t pte; + swp_entry_t entry; + + spin_lock(ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto out; + + migration_entry_wait_on_locked(entry, ptep, ptl); + return; +out: + pte_unmap_unlock(ptep, ptl); +} + +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + spinlock_t *ptl = pte_lockptr(mm, pmd); + pte_t *ptep = pte_offset_map(pmd, address); + __migration_entry_wait(mm, ptep, ptl); +} + +#ifdef CONFIG_HUGETLB_PAGE +void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) +{ + pte_t pte; + + spin_lock(ptl); + pte = huge_ptep_get(ptep); + + if (unlikely(!is_hugetlb_entry_migration(pte))) + spin_unlock(ptl); + else + migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl); +} + +void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) +{ + spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte); + + __migration_entry_wait_huge(pte, ptl); +} +#endif + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) +{ + spinlock_t *ptl; + + ptl = pmd_lock(mm, pmd); + if (!is_pmd_migration_entry(*pmd)) + goto unlock; + migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl); + return; +unlock: + spin_unlock(ptl); +} +#endif + +static int folio_expected_refs(struct address_space *mapping, + struct folio *folio) +{ + int refs = 1; + if (!mapping) + return refs; + + refs += folio_nr_pages(folio); + if (folio_test_private(folio)) + refs++; + + return refs; +} + +/* + * Replace the page in the mapping. + * + * The number of remaining references must be: + * 1 for anonymous pages without a mapping + * 2 for pages with a mapping + * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 
+ */
+int folio_migrate_mapping(struct address_space *mapping,
+		struct folio *newfolio, struct folio *folio, int extra_count)
+{
+	/*
+	 * NOTE(review): XA_STATE is set up before the !mapping check below;
+	 * with a NULL mapping it only forms the address of ->i_pages and the
+	 * state is never used on that path.
+	 */
+	XA_STATE(xas, &mapping->i_pages, folio_index(folio));
+	struct zone *oldzone, *newzone;
+	int dirty;
+	int expected_count = folio_expected_refs(mapping, folio) + extra_count;
+	long nr = folio_nr_pages(folio);
+	long entries, i;
+
+	if (!mapping) {
+		/*
+		 * Anonymous page without mapping: no xarray entry to replace,
+		 * so the refcount is only compared, not frozen.
+		 */
+		if (folio_ref_count(folio) != expected_count)
+			return -EAGAIN;
+
+		/* No turning back from here */
+		newfolio->index = folio->index;
+		newfolio->mapping = folio->mapping;
+		if (folio_test_swapbacked(folio))
+			__folio_set_swapbacked(newfolio);
+
+		return MIGRATEPAGE_SUCCESS;
+	}
+
+	oldzone = folio_zone(folio);
+	newzone = folio_zone(newfolio);
+
+	xas_lock_irq(&xas);
+	if (!folio_ref_freeze(folio, expected_count)) {
+		xas_unlock_irq(&xas);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Now we know that no one else is looking at the folio:
+	 * no turning back from here.
+	 */
+	newfolio->index = folio->index;
+	newfolio->mapping = folio->mapping;
+	folio_ref_add(newfolio, nr); /* add cache reference */
+	if (folio_test_swapbacked(folio)) {
+		__folio_set_swapbacked(newfolio);
+		if (folio_test_swapcache(folio)) {
+			folio_set_swapcache(newfolio);
+			newfolio->private = folio_get_private(folio);
+		}
+		entries = nr;
+	} else {
+		VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
+		entries = 1;
+	}
+
+	/* Move dirty while page refs frozen and newpage not yet exposed */
+	dirty = folio_test_dirty(folio);
+	if (dirty) {
+		folio_clear_dirty(folio);
+		folio_set_dirty(newfolio);
+	}
+
+	/* Swap cache still stores N entries instead of a high-order entry */
+	for (i = 0; i < entries; i++) {
+		xas_store(&xas, newfolio);
+		xas_next(&xas);
+	}
+
+	/*
+	 * Drop cache reference from old page by unfreezing
+	 * to one less reference.
+	 * We know this isn't the last reference.
+	 * ("One less" is per page: nr references are dropped here, matching
+	 * the nr added to newfolio above.)
+	 */
+	folio_ref_unfreeze(folio, expected_count - nr);
+
+	xas_unlock(&xas);
+	/* Leave irq disabled to prevent preemption while updating stats */
+
+	/*
+	 * If moved to a different zone then also account
+	 * the page for that zone. Other VM counters will be
+	 * taken care of when we establish references to the
+	 * new page and drop references to the old page.
+	 *
+	 * Note that anonymous pages are accounted for
+	 * via NR_FILE_PAGES and NR_ANON_MAPPED if they
+	 * are mapped to swap space.
+	 */
+	if (newzone != oldzone) {
+		struct lruvec *old_lruvec, *new_lruvec;
+		struct mem_cgroup *memcg;
+
+		memcg = folio_memcg(folio);
+		old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
+		new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
+
+		__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
+		__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
+		if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
+			__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
+			__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
+		}
+#ifdef CONFIG_SWAP
+		if (folio_test_swapcache(folio)) {
+			__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
+			__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
+		}
+#endif
+		if (dirty && mapping_can_writeback(mapping)) {
+			__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
+			__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
+			__mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
+			__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
+		}
+	}
+	local_irq_enable();	/* pairs with the irq-disable half of xas_lock_irq() above */
+
+	return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL(folio_migrate_mapping);
+
+/*
+ * The expected number of remaining references is the same as that
+ * of folio_migrate_mapping().
+ */ +int migrate_huge_page_move_mapping(struct address_space *mapping, + struct folio *dst, struct folio *src) +{ + XA_STATE(xas, &mapping->i_pages, folio_index(src)); + int expected_count; + + xas_lock_irq(&xas); + expected_count = 2 + folio_has_private(src); + if (!folio_ref_freeze(src, expected_count)) { + xas_unlock_irq(&xas); + return -EAGAIN; + } + + dst->index = src->index; + dst->mapping = src->mapping; + + folio_get(dst); + + xas_store(&xas, dst); + + folio_ref_unfreeze(src, expected_count - 1); + + xas_unlock_irq(&xas); + + return MIGRATEPAGE_SUCCESS; +} + +/* + * Copy the flags and some other ancillary information + */ +void folio_migrate_flags(struct folio *newfolio, struct folio *folio) +{ + int cpupid; + + if (folio_test_error(folio)) + folio_set_error(newfolio); + if (folio_test_referenced(folio)) + folio_set_referenced(newfolio); + if (folio_test_uptodate(folio)) + folio_mark_uptodate(newfolio); + if (folio_test_clear_active(folio)) { + VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio); + folio_set_active(newfolio); + } else if (folio_test_clear_unevictable(folio)) + folio_set_unevictable(newfolio); + if (folio_test_workingset(folio)) + folio_set_workingset(newfolio); + if (folio_test_checked(folio)) + folio_set_checked(newfolio); + /* + * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via + * migration entries. We can still have PG_anon_exclusive set on an + * effectively unmapped and unreferenced first sub-pages of an + * anonymous THP: we can simply copy it here via PG_mappedtodisk. + */ + if (folio_test_mappedtodisk(folio)) + folio_set_mappedtodisk(newfolio); + + /* Move dirty on pages not done by folio_migrate_mapping() */ + if (folio_test_dirty(folio)) + folio_set_dirty(newfolio); + + if (folio_test_young(folio)) + folio_set_young(newfolio); + if (folio_test_idle(folio)) + folio_set_idle(newfolio); + + /* + * Copy NUMA information to the new page, to prevent over-eager + * future migrations of this same page. 
+ */ + cpupid = page_cpupid_xchg_last(&folio->page, -1); + /* + * For memory tiering mode, when migrate between slow and fast + * memory node, reset cpupid, because that is used to record + * page access time in slow memory node. + */ + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) { + bool f_toptier = node_is_toptier(page_to_nid(&folio->page)); + bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page)); + + if (f_toptier != t_toptier) + cpupid = -1; + } + page_cpupid_xchg_last(&newfolio->page, cpupid); + + folio_migrate_ksm(newfolio, folio); + /* + * Please do not reorder this without considering how mm/ksm.c's + * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). + */ + if (folio_test_swapcache(folio)) + folio_clear_swapcache(folio); + folio_clear_private(folio); + + /* page->private contains hugetlb specific flags */ + if (!folio_test_hugetlb(folio)) + folio->private = NULL; + + /* + * If any waiters have accumulated on the new page then + * wake them up. + */ + if (folio_test_writeback(newfolio)) + folio_end_writeback(newfolio); + + /* + * PG_readahead shares the same bit with PG_reclaim. The above + * end_page_writeback() may clear PG_readahead mistakenly, so set the + * bit after that. 
+ */ + if (folio_test_readahead(folio)) + folio_set_readahead(newfolio); + + folio_copy_owner(newfolio, folio); + + if (!folio_test_hugetlb(folio)) + mem_cgroup_migrate(folio, newfolio); +} +EXPORT_SYMBOL(folio_migrate_flags); + +void folio_migrate_copy(struct folio *newfolio, struct folio *folio) +{ + folio_copy(newfolio, folio); + folio_migrate_flags(newfolio, folio); +} +EXPORT_SYMBOL(folio_migrate_copy); + +/************************************************************ + * Migration functions + ***********************************************************/ + +int migrate_folio_extra(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode, int extra_count) +{ + int rc; + + BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ + + rc = folio_migrate_mapping(mapping, dst, src, extra_count); + + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + if (mode != MIGRATE_SYNC_NO_COPY) + folio_migrate_copy(dst, src); + else + folio_migrate_flags(dst, src); + return MIGRATEPAGE_SUCCESS; +} + +/** + * migrate_folio() - Simple folio migration. + * @mapping: The address_space containing the folio. + * @dst: The folio to migrate the data to. + * @src: The folio containing the current data. + * @mode: How to migrate the page. + * + * Common logic to directly migrate a single LRU folio suitable for + * folios that do not use PagePrivate/PagePrivate2. + * + * Folios are locked upon entry and exit. 
+ */ +int migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode) +{ + return migrate_folio_extra(mapping, dst, src, mode, 0); +} +EXPORT_SYMBOL(migrate_folio); + +#ifdef CONFIG_BLOCK +/* Returns true if all buffers are successfully locked */ +static bool buffer_migrate_lock_buffers(struct buffer_head *head, + enum migrate_mode mode) +{ + struct buffer_head *bh = head; + + /* Simple case, sync compaction */ + if (mode != MIGRATE_ASYNC) { + do { + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return true; + } + + /* async case, we cannot block on lock_buffer so use trylock_buffer */ + do { + if (!trylock_buffer(bh)) { + /* + * We failed to lock the buffer and cannot stall in + * async migration. Release the taken locks + */ + struct buffer_head *failed_bh = bh; + bh = head; + while (bh != failed_bh) { + unlock_buffer(bh); + bh = bh->b_this_page; + } + return false; + } + + bh = bh->b_this_page; + } while (bh != head); + return true; +} + +static int __buffer_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode, + bool check_refs) +{ + struct buffer_head *bh, *head; + int rc; + int expected_count; + + head = folio_buffers(src); + if (!head) + return migrate_folio(mapping, dst, src, mode); + + /* Check whether page does not have extra refs before we do more work */ + expected_count = folio_expected_refs(mapping, src); + if (folio_ref_count(src) != expected_count) + return -EAGAIN; + + if (!buffer_migrate_lock_buffers(head, mode)) + return -EAGAIN; + + if (check_refs) { + bool busy; + bool invalidated = false; + +recheck_buffers: + busy = false; + spin_lock(&mapping->private_lock); + bh = head; + do { + if (atomic_read(&bh->b_count)) { + busy = true; + break; + } + bh = bh->b_this_page; + } while (bh != head); + if (busy) { + if (invalidated) { + rc = -EAGAIN; + goto unlock_buffers; + } + spin_unlock(&mapping->private_lock); + 
invalidate_bh_lrus(); + invalidated = true; + goto recheck_buffers; + } + } + + rc = folio_migrate_mapping(mapping, dst, src, 0); + if (rc != MIGRATEPAGE_SUCCESS) + goto unlock_buffers; + + folio_attach_private(dst, folio_detach_private(src)); + + bh = head; + do { + set_bh_page(bh, &dst->page, bh_offset(bh)); + bh = bh->b_this_page; + } while (bh != head); + + if (mode != MIGRATE_SYNC_NO_COPY) + folio_migrate_copy(dst, src); + else + folio_migrate_flags(dst, src); + + rc = MIGRATEPAGE_SUCCESS; +unlock_buffers: + if (check_refs) + spin_unlock(&mapping->private_lock); + bh = head; + do { + unlock_buffer(bh); + bh = bh->b_this_page; + } while (bh != head); + + return rc; +} + +/** + * buffer_migrate_folio() - Migration function for folios with buffers. + * @mapping: The address space containing @src. + * @dst: The folio to migrate to. + * @src: The folio to migrate from. + * @mode: How to migrate the folio. + * + * This function can only be used if the underlying filesystem guarantees + * that no other references to @src exist. For example attached buffer + * heads are accessed only under the folio lock. If your filesystem cannot + * provide this guarantee, buffer_migrate_folio_norefs() may be more + * appropriate. + * + * Return: 0 on success or a negative errno on failure. + */ +int buffer_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) +{ + return __buffer_migrate_folio(mapping, dst, src, mode, false); +} +EXPORT_SYMBOL(buffer_migrate_folio); + +/** + * buffer_migrate_folio_norefs() - Migration function for folios with buffers. + * @mapping: The address space containing @src. + * @dst: The folio to migrate to. + * @src: The folio to migrate from. + * @mode: How to migrate the folio. + * + * Like buffer_migrate_folio() except that this variant is more careful + * and checks that there are also no buffer head references. 
This function + * is the right one for mappings where buffer heads are directly looked + * up and referenced (such as block device mappings). + * + * Return: 0 on success or a negative errno on failure. + */ +int buffer_migrate_folio_norefs(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) +{ + return __buffer_migrate_folio(mapping, dst, src, mode, true); +} +#endif + +int filemap_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) +{ + int ret; + + ret = folio_migrate_mapping(mapping, dst, src, 0); + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + + if (folio_get_private(src)) + folio_attach_private(dst, folio_detach_private(src)); + + if (mode != MIGRATE_SYNC_NO_COPY) + folio_migrate_copy(dst, src); + else + folio_migrate_flags(dst, src); + return MIGRATEPAGE_SUCCESS; +} +EXPORT_SYMBOL_GPL(filemap_migrate_folio); + +/* + * Writeback a folio to clean the dirty state + */ +static int writeout(struct address_space *mapping, struct folio *folio) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1, + .range_start = 0, + .range_end = LLONG_MAX, + .for_reclaim = 1 + }; + int rc; + + if (!mapping->a_ops->writepage) + /* No write method for the address space */ + return -EINVAL; + + if (!folio_clear_dirty_for_io(folio)) + /* Someone else already triggered a write */ + return -EAGAIN; + + /* + * A dirty folio may imply that the underlying filesystem has + * the folio on some queue. So the folio must be clean for + * migration. Writeout may mean we lose the lock and the + * folio state is no longer what we checked for earlier. + * At this point we know that the migration attempt cannot + * be successful. + */ + remove_migration_ptes(folio, folio, false); + + rc = mapping->a_ops->writepage(&folio->page, &wbc); + + if (rc != AOP_WRITEPAGE_ACTIVATE) + /* unlocked. Relock */ + folio_lock(folio); + + return (rc < 0) ? 
-EIO : -EAGAIN; +} + +/* + * Default handling if a filesystem does not provide a migration function. + */ +static int fallback_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) +{ + if (folio_test_dirty(src)) { + /* Only writeback folios in full synchronous migration */ + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: + return -EBUSY; + } + return writeout(mapping, src); + } + + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (!filemap_release_folio(src, GFP_KERNEL)) + return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; + + return migrate_folio(mapping, dst, src, mode); +} + +/* + * Move a page to a newly allocated page + * The page is locked and all ptes have been successfully removed. + * + * The new page will have replaced the old page if this function + * is successful. + * + * Return value: + * < 0 - error code + * MIGRATEPAGE_SUCCESS - success + */ +static int move_to_new_folio(struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + int rc = -EAGAIN; + bool is_lru = !__PageMovable(&src->page); + + VM_BUG_ON_FOLIO(!folio_test_locked(src), src); + VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); + + if (likely(is_lru)) { + struct address_space *mapping = folio_mapping(src); + + if (!mapping) + rc = migrate_folio(mapping, dst, src, mode); + else if (mapping->a_ops->migrate_folio) + /* + * Most folios have a mapping and most filesystems + * provide a migrate_folio callback. Anonymous folios + * are part of swap space which also has its own + * migrate_folio callback. This is the most common path + * for page migration. + */ + rc = mapping->a_ops->migrate_folio(mapping, dst, src, + mode); + else + rc = fallback_migrate_folio(mapping, dst, src, mode); + } else { + const struct movable_operations *mops; + + /* + * In case of non-lru page, it could be released after + * isolation step. 
In that case, we shouldn't try migration.
+		 */
+		VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
+		if (!folio_test_movable(src)) {
+			rc = MIGRATEPAGE_SUCCESS;
+			folio_clear_isolated(src);
+			goto out;
+		}
+
+		mops = page_movable_ops(&src->page);
+		rc = mops->migrate_page(&dst->page, &src->page, mode);
+		WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
+				!folio_test_isolated(src));
+	}
+
+	/*
+	 * When successful, old pagecache src->mapping must be cleared before
+	 * src is freed; but stats require that PageAnon be left as PageAnon.
+	 */
+	if (rc == MIGRATEPAGE_SUCCESS) {
+		if (__PageMovable(&src->page)) {
+			VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
+
+			/*
+			 * We clear PG_movable under page_lock so any compactor
+			 * cannot try to migrate this page.
+			 */
+			folio_clear_isolated(src);
+		}
+
+		/*
+		 * Anonymous and movable src->mapping will be cleared by
+		 * free_pages_prepare so don't reset it here for keeping
+		 * the type to work PageAnon, for example.
+		 */
+		if (!folio_mapping_flags(src))
+			src->mapping = NULL;
+
+		if (likely(!folio_is_zone_device(dst)))
+			flush_dcache_folio(dst);
+	}
+out:
+	return rc;
+}
+
+/*
+ * Lock @src and @dst, replace the mappings of @src with migration entries
+ * and hand the actual move over to move_to_new_folio(). Whatever was
+ * unmapped is remapped afterwards, pointing at @dst on success and back at
+ * @src otherwise.
+ *
+ * @force: when zero, never sleep on the @src lock or on writeback.
+ *
+ * Return: MIGRATEPAGE_SUCCESS, -EAGAIN when a lock could not be taken or
+ * nothing was attempted, -EBUSY when @src is under writeback and @mode is
+ * neither MIGRATE_SYNC nor MIGRATE_SYNC_NO_COPY, or the result of
+ * move_to_new_folio().
+ */
+static int __unmap_and_move(struct folio *src, struct folio *dst,
+				int force, enum migrate_mode mode)
+{
+	int rc = -EAGAIN;
+	bool page_was_mapped = false;
+	struct anon_vma *anon_vma = NULL;
+	bool is_lru = !__PageMovable(&src->page);
+
+	if (!folio_trylock(src)) {
+		if (!force || mode == MIGRATE_ASYNC)
+			goto out;
+
+		/*
+		 * It's not safe for direct compaction to call lock_page.
+		 * For example, during page readahead pages are added locked
+		 * to the LRU. Later, when the IO completes the pages are
+		 * marked uptodate and unlocked. However, the queueing
+		 * could be merging multiple pages for one bio (e.g.
+		 * mpage_readahead). If an allocation happens for the
+		 * second or third page, the process can end up locking
+		 * the same page twice and deadlocking. Rather than
+		 * trying to be clever about what pages can be locked,
+		 * avoid the use of lock_page for direct compaction
+		 * altogether.
+		 */
+		if (current->flags & PF_MEMALLOC)
+			goto out;
+
+		folio_lock(src);
+	}
+
+	if (folio_test_writeback(src)) {
+		/*
+		 * Only in the case of a full synchronous migration is it
+		 * necessary to wait for PageWriteback. In the async case,
+		 * the retry loop is too short and in the sync-light case,
+		 * the overhead of stalling is too much
+		 */
+		switch (mode) {
+		case MIGRATE_SYNC:
+		case MIGRATE_SYNC_NO_COPY:
+			break;
+		default:
+			rc = -EBUSY;
+			goto out_unlock;
+		}
+		if (!force)
+			goto out_unlock;
+		folio_wait_writeback(src);
+	}
+
+	/*
+	 * By try_to_migrate(), src->mapcount goes down to 0 here. In this case,
+	 * we cannot notice that anon_vma is freed while we migrate a page.
+	 * This get_anon_vma() delays freeing anon_vma pointer until the end
+	 * of migration. File cache pages are no problem because of page_lock()
+	 * File Caches may use write_page() or lock_page() in migration, then,
+	 * just care Anon page here.
+	 *
+	 * Only folio_get_anon_vma() understands the subtleties of
+	 * getting a hold on an anon_vma from outside one of its mms.
+	 * But if we cannot get anon_vma, then we won't need it anyway,
+	 * because that implies that the anon page is no longer mapped
+	 * (and cannot be remapped so long as we hold the page lock).
+	 */
+	if (folio_test_anon(src) && !folio_test_ksm(src))
+		anon_vma = folio_get_anon_vma(src);
+
+	/*
+	 * Block others from accessing the new page when we get around to
+	 * establishing additional references. We are usually the only one
+	 * holding a reference to dst at this point. We used to have a BUG
+	 * here if folio_trylock(dst) fails, but would like to allow for
+	 * cases where there might be a race with the previous use of dst.
+	 * This is much like races on refcount of oldpage: just don't BUG().
+	 */
+	if (unlikely(!folio_trylock(dst)))
+		goto out_unlock;
+
+	if (unlikely(!is_lru)) {
+		rc = move_to_new_folio(dst, src, mode);
+		goto out_unlock_both;
+	}
+
+	/*
+	 * Corner case handling:
+	 * 1. When a new swap-cache page is read into, it is added to the LRU
+	 * and treated as swapcache but it has no rmap yet.
+	 * Calling try_to_unmap() against a src->mapping==NULL page will
+	 * trigger a BUG. So handle it here.
+	 * 2. An orphaned page (see truncate_cleanup_page) might have
+	 * fs-private metadata. The page can be picked up due to memory
+	 * offlining. Everywhere else except page reclaim, the page is
+	 * invisible to the vm, so the page can not be migrated. So try to
+	 * free the metadata, so the page can be freed.
+	 */
+	if (!src->mapping) {
+		if (folio_test_private(src)) {
+			try_to_free_buffers(src);
+			goto out_unlock_both;
+		}
+	} else if (folio_mapped(src)) {
+		/* Establish migration ptes */
+		VM_BUG_ON_FOLIO(folio_test_anon(src) &&
+			       !folio_test_ksm(src) && !anon_vma, src);
+		try_to_migrate(src, 0);
+		page_was_mapped = true;
+	}
+
+	if (!folio_mapped(src))
+		rc = move_to_new_folio(dst, src, mode);
+
+	/*
+	 * When successful, push dst to LRU immediately: so that if it
+	 * turns out to be an mlocked page, remove_migration_ptes() will
+	 * automatically build up the correct dst->mlock_count for it.
+	 *
+	 * We would like to do something similar for the old page, when
+	 * unsuccessful, and other cases when a page has been temporarily
+	 * isolated from the unevictable LRU: but this case is the easiest.
+	 */
+	if (rc == MIGRATEPAGE_SUCCESS) {
+		folio_add_lru(dst);
+		if (page_was_mapped)
+			lru_add_drain();
+	}
+
+	if (page_was_mapped)
+		remove_migration_ptes(src,
+			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+
+out_unlock_both:
+	folio_unlock(dst);
+out_unlock:
+	/* Drop an anon_vma reference if we took one */
+	if (anon_vma)
+		put_anon_vma(anon_vma);
+	folio_unlock(src);
+out:
+	/*
+	 * If migration is successful, decrease refcount of dst,
+	 * which will not free the page because new page owner increased
+	 * refcounter.
+	 */
+	if (rc == MIGRATEPAGE_SUCCESS)
+		folio_put(dst);
+
+	return rc;
+}
+
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ *
+ * The target page comes from @get_new_page. Apart from the early
+ * -ENOSYS/-ENOMEM returns, a page whose migration fails with anything but
+ * -EAGAIN is moved to @ret; an -EAGAIN page is left on its current list.
+ * An unused target page is handed to @put_new_page when one is supplied,
+ * otherwise it is released with put_page().
+ */
+static int unmap_and_move(new_page_t get_new_page,
+				   free_page_t put_new_page,
+				   unsigned long private, struct page *page,
+				   int force, enum migrate_mode mode,
+				   enum migrate_reason reason,
+				   struct list_head *ret)
+{
+	struct folio *dst, *src = page_folio(page);
+	int rc = MIGRATEPAGE_SUCCESS;
+	struct page *newpage = NULL;
+
+	if (!thp_migration_supported() && PageTransHuge(page))
+		return -ENOSYS;
+
+	if (page_count(page) == 1) {
+		/* Page was freed from under us. So we are done. */
+		ClearPageActive(page);
+		ClearPageUnevictable(page);
+		/* free_pages_prepare() will clear PG_isolated. */
+		goto out;
+	}
+
+	newpage = get_new_page(page, private);
+	if (!newpage)
+		return -ENOMEM;
+	dst = page_folio(newpage);
+
+	newpage->private = 0;
+	rc = __unmap_and_move(src, dst, force, mode);
+	if (rc == MIGRATEPAGE_SUCCESS)
+		set_page_owner_migrate_reason(newpage, reason);
+
+out:
+	if (rc != -EAGAIN) {
+		/*
+		 * A page that has been migrated has all references
+		 * removed and will be freed. A page that has not been
+		 * migrated will have kept its references and be restored.
+		 */
+		list_del(&page->lru);
+	}
+
+	/*
+	 * If migration is successful, releases reference grabbed during
+	 * isolation. Otherwise, restore the page to right list unless
+	 * we want to retry.
+	 */
+	if (rc == MIGRATEPAGE_SUCCESS) {
+		/*
+		 * Compaction can migrate also non-LRU pages which are
+		 * not accounted to NR_ISOLATED_*.
They can be recognized
+		 * as __PageMovable
+		 */
+		if (likely(!__PageMovable(page)))
+			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+					page_is_file_lru(page), -thp_nr_pages(page));
+
+		if (reason != MR_MEMORY_FAILURE)
+			/*
+			 * We release the page in page_handle_poison.
+			 */
+			put_page(page);
+	} else {
+		if (rc != -EAGAIN)
+			list_add_tail(&page->lru, ret);
+
+		if (put_new_page)
+			put_new_page(newpage, private);
+		else
+			put_page(newpage);
+	}
+
+	return rc;
+}
+
+/*
+ * Counterpart of unmap_and_move() for hugepage migration.
+ *
+ * This function doesn't wait the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepage.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and writeback status of all subpages are counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference of the head page is 512 and a bit more.)
+ * This means that when we try to migrate hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ *
+ * Return: MIGRATEPAGE_SUCCESS, -ENOSYS when the hstate is not migratable,
+ * -ENOMEM when no target page could be obtained, -EBUSY for a page that is
+ * being freed, -EAGAIN when a lock could not be taken, or the result of
+ * move_to_new_folio(). A page failing with anything but -EAGAIN after the
+ * target was allocated is moved to @ret.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+				free_page_t put_new_page, unsigned long private,
+				struct page *hpage, int force,
+				enum migrate_mode mode, int reason,
+				struct list_head *ret)
+{
+	struct folio *dst, *src = page_folio(hpage);
+	int rc = -EAGAIN;
+	int page_was_mapped = 0;
+	struct page *new_hpage;
+	struct anon_vma *anon_vma = NULL;
+	struct address_space *mapping = NULL;
+
+	/*
+	 * Migratability of hugepages depends on architectures and their size.
+	 * This check is necessary because some callers of hugepage migration
+	 * like soft offline and memory hotremove don't walk through page
+	 * tables or check whether the hugepage is pmd-based or not before
+	 * kicking migration.
+	 */
+	if (!hugepage_migration_supported(page_hstate(hpage)))
+		return -ENOSYS;
+
+	if (folio_ref_count(src) == 1) {
+		/* page was freed from under us. So we are done. */
+		putback_active_hugepage(hpage);
+		return MIGRATEPAGE_SUCCESS;
+	}
+
+	new_hpage = get_new_page(hpage, private);
+	if (!new_hpage)
+		return -ENOMEM;
+	dst = page_folio(new_hpage);
+
+	if (!folio_trylock(src)) {
+		if (!force)
+			goto out;
+		switch (mode) {
+		case MIGRATE_SYNC:
+		case MIGRATE_SYNC_NO_COPY:
+			break;
+		default:
+			goto out;
+		}
+		folio_lock(src);
+	}
+
+	/*
+	 * Check for pages which are in the process of being freed. Without
+	 * folio_mapping() set, hugetlbfs specific move page routine will not
+	 * be called and we could leak usage counts for subpools.
+	 */
+	if (hugetlb_page_subpool(hpage) && !folio_mapping(src)) {
+		rc = -EBUSY;
+		goto out_unlock;
+	}
+
+	if (folio_test_anon(src))
+		anon_vma = folio_get_anon_vma(src);
+
+	if (unlikely(!folio_trylock(dst)))
+		goto put_anon;
+
+	if (folio_mapped(src)) {
+		enum ttu_flags ttu = 0;
+
+		if (!folio_test_anon(src)) {
+			/*
+			 * In shared mappings, try_to_unmap could potentially
+			 * call huge_pmd_unshare. Because of this, take
+			 * semaphore in write mode here and set TTU_RMAP_LOCKED
+			 * to let lower levels know we have taken the lock.
+			 */
+			mapping = hugetlb_page_mapping_lock_write(hpage);
+			if (unlikely(!mapping))
+				goto unlock_put_anon;
+
+			ttu = TTU_RMAP_LOCKED;
+		}
+
+		try_to_migrate(src, ttu);
+		page_was_mapped = 1;
+
+		if (ttu & TTU_RMAP_LOCKED)
+			i_mmap_unlock_write(mapping);
+	}
+
+	if (!folio_mapped(src))
+		rc = move_to_new_folio(dst, src, mode);
+
+	if (page_was_mapped)
+		remove_migration_ptes(src,
+			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+
+unlock_put_anon:
+	folio_unlock(dst);
+
+put_anon:
+	if (anon_vma)
+		put_anon_vma(anon_vma);
+
+	if (rc == MIGRATEPAGE_SUCCESS) {
+		move_hugetlb_state(hpage, new_hpage, reason);
+		/* The target is in use now: do not give it to the callback. */
+		put_new_page = NULL;
+	}
+
+out_unlock:
+	folio_unlock(src);
+out:
+	if (rc == MIGRATEPAGE_SUCCESS)
+		putback_active_hugepage(hpage);
+	else if (rc != -EAGAIN)
+		list_move_tail(&src->lru, ret);
+
+	/*
+	 * If migration was not successful and there's a freeing callback, use
+	 * it. Otherwise, put_page() will drop the reference grabbed during
+	 * isolation.
+	 */
+	if (put_new_page)
+		put_new_page(new_hpage, private);
+	else
+		putback_active_hugepage(new_hpage);
+
+	return rc;
+}
+
+/*
+ * Lock @page and try to split it with split_huge_page_to_list(). On
+ * success (zero return) @page itself is moved to the tail of @split_pages
+ * as well. The split result is returned unchanged.
+ */
+static inline int try_split_thp(struct page *page, struct list_head *split_pages)
+{
+	int rc;
+
+	lock_page(page);
+	rc = split_huge_page_to_list(page, split_pages);
+	unlock_page(page);
+	if (!rc)
+		list_move_tail(&page->lru, split_pages);
+
+	return rc;
+}
+
+/*
+ * migrate_pages - migrate the pages specified in a list, to the free pages
+ *		   supplied as the target for the page migration
+ *
+ * @from:		The list of pages to be migrated.
+ * @get_new_page:	The function used to allocate free pages to be used
+ *			as the target of the page migration.
+ * @put_new_page:	The function used to free target pages if migration
+ *			fails, or NULL if no special handling is necessary.
+ * @private:		Private data to be passed on to get_new_page()
+ * @mode:		The migration mode that specifies the constraints for
+ *			page migration, if any.
+ * @reason:		The reason for page migration.
+ * @ret_succeeded:	Set to the number of normal pages migrated successfully if
+ *			the caller passes a non-NULL pointer.
+ *
+ * The function returns after 10 attempts or if no pages are movable any more
+ * because the list has become empty or no retryable pages exist any more.
+ * It is caller's responsibility to call putback_movable_pages() to return pages
+ * to the LRU or free list only if ret != 0.
+ *
+ * Returns the number of {normal page, THP, hugetlb} that were not migrated, or
+ * an error code. The number of THP splits will be considered as the number of
+ * non-migrated THP, no matter how many subpages of the THP are migrated successfully.
+ */
+int migrate_pages(struct list_head *from, new_page_t get_new_page,
+		free_page_t put_new_page, unsigned long private,
+		enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
+{
+	int retry = 1;
+	int thp_retry = 1;
+	int nr_failed = 0;
+	int nr_failed_pages = 0;
+	int nr_retry_pages = 0;
+	int nr_succeeded = 0;
+	int nr_thp_succeeded = 0;
+	int nr_thp_failed = 0;
+	int nr_thp_split = 0;
+	int pass = 0;
+	bool is_thp = false;
+	struct page *page;
+	struct page *page2;
+	int rc, nr_subpages;
+	LIST_HEAD(ret_pages);
+	LIST_HEAD(thp_split_pages);
+	bool nosplit = (reason == MR_NUMA_MISPLACED);
+	bool no_subpage_counting = false;
+
+	trace_mm_migrate_pages_start(mode, reason);
+
+thp_subpage_migration:
+	for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
+		retry = 0;
+		thp_retry = 0;
+		nr_retry_pages = 0;
+
+		list_for_each_entry_safe(page, page2, from, lru) {
+			/*
+			 * THP statistics is based on the source huge page.
+			 * Capture required information that might get lost
+			 * during migration.
+			 */
+			is_thp = PageTransHuge(page) && !PageHuge(page);
+			nr_subpages = compound_nr(page);
+			cond_resched();
+
+			/* "force" is only passed from the fourth pass on. */
+			if (PageHuge(page))
+				rc = unmap_and_move_huge_page(get_new_page,
+						put_new_page, private, page,
+						pass > 2, mode, reason,
+						&ret_pages);
+			else
+				rc = unmap_and_move(get_new_page, put_new_page,
+						private, page, pass > 2, mode,
+						reason, &ret_pages);
+			/*
+			 * The rules are:
+			 *	Success: non hugetlb page will be freed, hugetlb
+			 *		 page will be put back
+			 *	-EAGAIN: stay on the from list
+			 *	-ENOMEM: stay on the from list, unless it is a
+			 *		 THP that could be split
+			 *	-ENOSYS: a THP that could be split goes to
+			 *		 thp_split_pages, anything else is put
+			 *		 on ret_pages list
+			 *	Other errno: put on ret_pages list then splice to
+			 *		     from list
+			 */
+			switch(rc) {
+			/*
+			 * THP migration might be unsupported or the
+			 * allocation could've failed so we should
+			 * retry on the same page with the THP split
+			 * to base pages.
+			 *
+			 * Sub-pages are put in thp_split_pages, and
+			 * we will migrate them after the rest of the
+			 * list is processed.
+			 */
+			case -ENOSYS:
+				/* THP migration is unsupported */
+				if (is_thp) {
+					nr_thp_failed++;
+					if (!try_split_thp(page, &thp_split_pages)) {
+						nr_thp_split++;
+						break;
+					}
+				/* Hugetlb migration is unsupported */
+				} else if (!no_subpage_counting) {
+					nr_failed++;
+				}
+
+				nr_failed_pages += nr_subpages;
+				list_move_tail(&page->lru, &ret_pages);
+				break;
+			case -ENOMEM:
+				/*
+				 * When memory is low, don't bother to try to migrate
+				 * other pages, just exit.
+				 */
+				if (is_thp) {
+					nr_thp_failed++;
+					/* THP NUMA faulting doesn't split THP to retry. */
+					if (!nosplit && !try_split_thp(page, &thp_split_pages)) {
+						nr_thp_split++;
+						break;
+					}
+				} else if (!no_subpage_counting) {
+					nr_failed++;
+				}
+
+				nr_failed_pages += nr_subpages + nr_retry_pages;
+				/*
+				 * There might be some subpages of fail-to-migrate THPs
+				 * left in thp_split_pages list. Move them back to migration
+				 * list so that they could be put back to the right list by
+				 * the caller otherwise the page refcnt will be leaked.
+				 */
+				list_splice_init(&thp_split_pages, from);
+				/*
+				 * nr_failed is not brought up to date here: rc
+				 * keeps the error code on this path, so the
+				 * counter is not used.
+				 */
+				nr_thp_failed += thp_retry;
+				goto out;
+			case -EAGAIN:
+				if (is_thp)
+					thp_retry++;
+				else if (!no_subpage_counting)
+					retry++;
+				nr_retry_pages += nr_subpages;
+				break;
+			case MIGRATEPAGE_SUCCESS:
+				nr_succeeded += nr_subpages;
+				if (is_thp)
+					nr_thp_succeeded++;
+				break;
+			default:
+				/*
+				 * Permanent failure (-EBUSY, etc.):
+				 * unlike -EAGAIN case, the failed page is
+				 * removed from migration page list and not
+				 * retried in the next outer loop.
+				 */
+				if (is_thp)
+					nr_thp_failed++;
+				else if (!no_subpage_counting)
+					nr_failed++;
+
+				nr_failed_pages += nr_subpages;
+				break;
+			}
+		}
+	}
+	/* Pages still asking for a retry after the last pass count as failed. */
+	nr_failed += retry;
+	nr_thp_failed += thp_retry;
+	nr_failed_pages += nr_retry_pages;
+	/*
+	 * Try to migrate subpages of fail-to-migrate THPs, no nr_failed
+	 * counting in this round, since all subpages of a THP is counted
+	 * as 1 failure in the first round.
+	 */
+	if (!list_empty(&thp_split_pages)) {
+		/*
+		 * Move non-migrated pages (after 10 retries) to ret_pages
+		 * to avoid migrating them again.
+		 */
+		list_splice_init(from, &ret_pages);
+		list_splice_init(&thp_split_pages, from);
+		no_subpage_counting = true;
+		retry = 1;
+		goto thp_subpage_migration;
+	}
+
+	rc = nr_failed + nr_thp_failed;
+out:
+	/*
+	 * Put the permanent failure page back to migration list, they
+	 * will be put back to the right list by the caller.
+	 */
+	list_splice(&ret_pages, from);
+
+	/*
+	 * Return 0 in case all subpages of fail-to-migrate THPs are
+	 * migrated successfully.
+	 */
+	if (list_empty(from))
+		rc = 0;
+
+	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+	count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
+	count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
+	count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
+	count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
+	trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
+			       nr_thp_failed, nr_thp_split, mode, reason);
+
+	if (ret_succeeded)
+		*ret_succeeded = nr_succeeded;
+
+	return rc;
+}
+
+/*
+ * Target allocation callback for migrate_pages(). @private carries a pointer
+ * to a struct migration_target_control holding the target node, nodemask
+ * and gfp mask; with NUMA_NO_NODE the node of the source folio is used.
+ */
+struct page *alloc_migration_target(struct page *page, unsigned long private)
+{
+	struct folio *folio = page_folio(page);
+	struct migration_target_control *mtc;
+	gfp_t gfp_mask;
+	unsigned int order = 0;
+	struct folio *new_folio = NULL;
+	int nid;
+	int zidx;
+
+	mtc = (struct migration_target_control *)private;
+	gfp_mask = mtc->gfp_mask;
+	nid = mtc->nid;
+	if (nid == NUMA_NO_NODE)
+		nid = folio_nid(folio);
+
+	if (folio_test_hugetlb(folio)) {
+		struct hstate *h = page_hstate(&folio->page);
+
+		gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
+		return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
+	}
+
+	if (folio_test_large(folio)) {
+		/*
+		 * clear __GFP_RECLAIM to make the migration callback
+		 * consistent with regular THP allocations.
+		 */
+		gfp_mask &= ~__GFP_RECLAIM;
+		gfp_mask |= GFP_TRANSHUGE;
+		order = folio_order(folio);
+	}
+	zidx = zone_idx(folio_zone(folio));
+	if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
+		gfp_mask |= __GFP_HIGHMEM;
+
+	new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask);
+
+	return &new_folio->page;
+}
+
+#ifdef CONFIG_NUMA
+
+/*
+ * Store @value into @nr consecutive entries of the user array @status,
+ * starting at index @start.
+ *
+ * Return: 0, or -EFAULT as soon as put_user() fails.
+ */
+static int store_status(int __user *status, int start, int value, int nr)
+{
+	while (nr-- > 0) {
+		if (put_user(value, status + start))
+			return -EFAULT;
+		start++;
+	}
+
+	return 0;
+}
+
+/*
+ * Migrate every page on @pagelist to @node (MIGRATE_SYNC, MR_SYSCALL) with
+ * targets allocated by alloc_migration_target() under __GFP_THISNODE.
+ * Whatever is left on the list after a non-zero result is handed to
+ * putback_movable_pages().
+ *
+ * Return: the result of migrate_pages().
+ */
+static int do_move_pages_to_node(struct mm_struct *mm,
+		struct list_head *pagelist, int node)
+{
+	int err;
+	struct migration_target_control mtc = {
+		.nid = node,
+		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+	};
+
+	err = migrate_pages(pagelist, alloc_migration_target, NULL,
+		(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
+	if (err)
+		putback_movable_pages(pagelist);
+	return err;
+}
+
+/*
+ * Resolves the given address to a struct page, isolates it from the LRU and
+ * puts it to the given pagelist.
+ * Returns:
+ *     errno - if the page cannot be found/isolated
+ *     0 - when it doesn't have to be migrated because it is already on the
+ *         target node
+ *     1 - when it has been queued
+ *
+ * The lookup is done under mmap_read_lock(@mm). A page mapped more than
+ * once is only queued when @migrate_all is set; otherwise -EACCES is
+ * returned for it.
+ */
+static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
+		int node, struct list_head *pagelist, bool migrate_all)
+{
+	struct vm_area_struct *vma;
+	struct page *page;
+	int err;
+
+	mmap_read_lock(mm);
+	err = -EFAULT;
+	vma = vma_lookup(mm, addr);
+	if (!vma || !vma_migratable(vma))
+		goto out;
+
+	/* FOLL_DUMP to ignore special (like zero) pages */
+	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+
+	err = PTR_ERR(page);
+	if (IS_ERR(page))
+		goto out;
+
+	err = -ENOENT;
+	if (!page)
+		goto out;
+
+	/* Zone device pages are reported as -ENOENT as well. */
+	if (is_zone_device_page(page))
+		goto out_putpage;
+
+	err = 0;
+	if (page_to_nid(page) == node)
+		goto out_putpage;
+
+	err = -EACCES;
+	if (page_mapcount(page) > 1 && !migrate_all)
+		goto out_putpage;
+
+	if (PageHuge(page)) {
+		/* A hugetlb tail page is not queued: err stays -EACCES. */
+		if (PageHead(page)) {
+			err = isolate_hugetlb(page, pagelist);
+			if (!err)
+				err = 1;
+		}
+	} else {
+		struct page *head;
+
+		head = compound_head(page);
+		err = isolate_lru_page(head);
+		if (err)
+			goto out_putpage;
+
+		err = 1;
+		list_add_tail(&head->lru, pagelist);
+		mod_node_page_state(page_pgdat(head),
+			NR_ISOLATED_ANON + page_is_file_lru(head),
+			thp_nr_pages(head));
+	}
+out_putpage:
+	/*
+	 * Either remove the duplicate refcount from
+	 * isolate_lru_page() or drop the page ref if it was
+	 * not isolated.
+	 */
+	put_page(page);
+out:
+	mmap_read_unlock(mm);
+	return err;
+}
+
+/*
+ * Migrate the pages queued on @pagelist to @node and, when that fully
+ * succeeds, record @node in @status for the entries [@start, @i). An empty
+ * list is a no-op returning 0.
+ */
+static int move_pages_and_store_status(struct mm_struct *mm, int node,
+		struct list_head *pagelist, int __user *status,
+		int start, int i, unsigned long nr_pages)
+{
+	int err;
+
+	if (list_empty(pagelist))
+		return 0;
+
+	err = do_move_pages_to_node(mm, pagelist, node);
+	if (err) {
+		/*
+		 * Positive err means the number of failed
+		 * pages to migrate.
Since we are going to
+		 * abort and return the number of non-migrated
+		 * pages, so need to include the rest of the
+		 * nr_pages that have not been attempted as
+		 * well.
+		 */
+		if (err > 0)
+			err += nr_pages - i;
+		return err;
+	}
+	return store_status(status, start, node, i - start);
+}
+
+/*
+ * Migrate an array of page address onto an array of nodes and fill
+ * the corresponding array of status.
+ *
+ * Consecutive entries asking for the same node are collected on a local
+ * list and migrated as one batch; the batch is flushed whenever the
+ * requested node changes, whenever an entry could not be queued, and once
+ * more on the way out.
+ *
+ * Return: 0, a negative errno, or a positive count of pages that were not
+ * migrated.
+ */
+static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
+			 unsigned long nr_pages,
+			 const void __user * __user *pages,
+			 const int __user *nodes,
+			 int __user *status, int flags)
+{
+	compat_uptr_t __user *compat_pages = (void __user *)pages;
+	int current_node = NUMA_NO_NODE;
+	LIST_HEAD(pagelist);
+	int start, i;
+	int err = 0, err1;
+
+	lru_cache_disable();
+
+	for (i = start = 0; i < nr_pages; i++) {
+		const void __user *p;
+		unsigned long addr;
+		int node;
+
+		err = -EFAULT;
+		if (in_compat_syscall()) {
+			compat_uptr_t cp;
+
+			if (get_user(cp, compat_pages + i))
+				goto out_flush;
+
+			p = compat_ptr(cp);
+		} else {
+			if (get_user(p, pages + i))
+				goto out_flush;
+		}
+		if (get_user(node, nodes + i))
+			goto out_flush;
+		addr = (unsigned long)untagged_addr(p);
+
+		err = -ENODEV;
+		if (node < 0 || node >= MAX_NUMNODES)
+			goto out_flush;
+		if (!node_state(node, N_MEMORY))
+			goto out_flush;
+
+		err = -EACCES;
+		if (!node_isset(node, task_nodes))
+			goto out_flush;
+
+		if (current_node == NUMA_NO_NODE) {
+			current_node = node;
+			start = i;
+		} else if (node != current_node) {
+			err = move_pages_and_store_status(mm, current_node,
+					&pagelist, status, start, i, nr_pages);
+			if (err)
+				goto out;
+			start = i;
+			current_node = node;
+		}
+
+		/*
+		 * Errors in the page lookup or isolation are not fatal and we simply
+		 * report them via status
+		 */
+		err = add_page_for_migration(mm, addr, current_node,
+				&pagelist, flags & MPOL_MF_MOVE_ALL);
+
+		if (err > 0) {
+			/* The page is successfully queued for migration */
+			continue;
+		}
+
+		/*
+		 * The move_pages() man page does not have an -EEXIST choice, so
+		 * use -EFAULT instead.
+		 */
+		if (err == -EEXIST)
+			err = -EFAULT;
+
+		/*
+		 * If the page is already on the target node (!err), store the
+		 * node, otherwise, store the err.
+		 */
+		err = store_status(status, i, err ? : current_node, 1);
+		if (err)
+			goto out_flush;
+
+		err = move_pages_and_store_status(mm, current_node, &pagelist,
+				status, start, i, nr_pages);
+		if (err) {
+			/* We have accounted for page i */
+			if (err > 0)
+				err--;
+			goto out;
+		}
+		current_node = NUMA_NO_NODE;
+	}
+out_flush:
+	/* Make sure we do not overwrite the existing error */
+	err1 = move_pages_and_store_status(mm, current_node, &pagelist,
+				status, start, i, nr_pages);
+	if (err >= 0)
+		err = err1;
+out:
+	lru_cache_enable();
+	return err;
+}
+
+/*
+ * Determine the nodes of an array of pages and store it in an array of status.
+ *
+ * Both @pages and @status are dereferenced directly here; do_pages_stat()
+ * passes its on-stack chunk buffers. Each status entry receives the node
+ * id of the page or a negative errno (-EFAULT without a vma, -ENOENT when
+ * no page is found or the page is a zone device page).
+ */
+static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
+				const void __user **pages, int *status)
+{
+	unsigned long i;
+
+	mmap_read_lock(mm);
+
+	for (i = 0; i < nr_pages; i++) {
+		unsigned long addr = (unsigned long)(*pages);
+		unsigned int foll_flags = FOLL_DUMP;
+		struct vm_area_struct *vma;
+		struct page *page;
+		int err = -EFAULT;
+
+		vma = vma_lookup(mm, addr);
+		if (!vma)
+			goto set_status;
+
+		/* Not all huge page follow APIs support 'FOLL_GET' */
+		if (!is_vm_hugetlb_page(vma))
+			foll_flags |= FOLL_GET;
+
+		/* FOLL_DUMP to ignore special (like zero) pages */
+		page = follow_page(vma, addr, foll_flags);
+
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto set_status;
+
+		err = -ENOENT;
+		if (!page)
+			goto set_status;
+
+		if (!is_zone_device_page(page))
+			err = page_to_nid(page);
+
+		if (foll_flags & FOLL_GET)
+			put_page(page);
+set_status:
+		*status = err;
+
+		pages++;
+		status++;
+	}
+
+	mmap_read_unlock(mm);
+}
+
+/*
+ * Read @chunk_nr compat (32-bit) user pointers from @pages and widen them
+ * into @chunk_pages.
+ */
+static int get_compat_pages_array(const void __user *chunk_pages[],
+				  const void __user * __user *pages,
+				  unsigned long chunk_nr)
+{
+	compat_uptr_t __user *pages32 =
(compat_uptr_t __user *)pages;
+	compat_uptr_t p;
+	int i;
+
+	for (i = 0; i < chunk_nr; i++) {
+		if (get_user(p, pages32 + i))
+			return -EFAULT;
+		chunk_pages[i] = compat_ptr(p);
+	}
+
+	return 0;
+}
+
+/*
+ * Determine the nodes of a user array of pages and store it in
+ * a user array of status.
+ *
+ * The arrays are processed in chunks of at most DO_PAGES_STAT_CHUNK_NR
+ * entries through on-stack buffers.
+ *
+ * Return: 0, or -EFAULT if copying from or to user space failed before all
+ * entries were handled.
+ */
+static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
+			 const void __user * __user *pages,
+			 int __user *status)
+{
+#define DO_PAGES_STAT_CHUNK_NR 16UL
+	const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
+	int chunk_status[DO_PAGES_STAT_CHUNK_NR];
+
+	while (nr_pages) {
+		unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
+
+		if (in_compat_syscall()) {
+			if (get_compat_pages_array(chunk_pages, pages,
+						   chunk_nr))
+				break;
+		} else {
+			if (copy_from_user(chunk_pages, pages,
+				      chunk_nr * sizeof(*chunk_pages)))
+				break;
+		}
+
+		do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
+
+		if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
+			break;
+
+		pages += chunk_nr;
+		status += chunk_nr;
+		nr_pages -= chunk_nr;
+	}
+	/* A non-zero remainder means one of the copies above failed. */
+	return nr_pages ? -EFAULT : 0;
+}
+
+/*
+ * Look up the mm of the task identified by @pid (the current task when
+ * @pid is 0), take a reference on it and report that task's
+ * cpuset_mems_allowed() through @mem_nodes.
+ *
+ * Return: the mm, or an ERR_PTR(): -ESRCH when no task matches, -EPERM when
+ * ptrace_may_access() refuses, the security_task_movememory() error, or
+ * -EINVAL when the task has no mm.
+ */
+static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+
+	/*
+	 * There is no need to check if current process has the right to modify
+	 * the specified process when they are same.
+	 */
+	if (!pid) {
+		mmget(current->mm);
+		*mem_nodes = cpuset_mems_allowed(current);
+		return current->mm;
+	}
+
+	/* Find the mm_struct */
+	rcu_read_lock();
+	task = find_task_by_vpid(pid);
+	if (!task) {
+		rcu_read_unlock();
+		return ERR_PTR(-ESRCH);
+	}
+	get_task_struct(task);
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. Use the regular "ptrace_may_access()" checks.
+	 */
+	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+		rcu_read_unlock();
+		mm = ERR_PTR(-EPERM);
+		goto out;
+	}
+	rcu_read_unlock();
+
+	mm = ERR_PTR(security_task_movememory(task));
+	if (IS_ERR(mm))
+		goto out;
+	*mem_nodes = cpuset_mems_allowed(task);
+	mm = get_task_mm(task);
+out:
+	put_task_struct(task);
+	if (!mm)
+		mm = ERR_PTR(-EINVAL);
+	return mm;
+}
+
+/*
+ * Move a list of pages in the address space of the process identified by
+ * @pid (the currently executing process when @pid is 0).
+ *
+ * Only MPOL_MF_MOVE and MPOL_MF_MOVE_ALL are accepted in @flags, and
+ * MPOL_MF_MOVE_ALL requires CAP_SYS_NICE. With a NULL @nodes array nothing
+ * is moved: the node of every page is reported through @status instead.
+ */
+static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
+			     const void __user * __user *pages,
+			     const int __user *nodes,
+			     int __user *status, int flags)
+{
+	struct mm_struct *mm;
+	int err;
+	nodemask_t task_nodes;
+
+	/* Check flags */
+	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
+		return -EINVAL;
+
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	mm = find_mm_struct(pid, &task_nodes);
+	if (IS_ERR(mm))
+		return PTR_ERR(mm);
+
+	if (nodes)
+		err = do_pages_move(mm, task_nodes, nr_pages, pages,
+				    nodes, status, flags);
+	else
+		err = do_pages_stat(mm, nr_pages, pages, status);
+
+	/* Drop the reference taken by find_mm_struct(). */
+	mmput(mm);
+	return err;
+}
+
+SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
+		const void __user * __user *, pages,
+		const int __user *, nodes,
+		int __user *, status, int, flags)
+{
+	return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns true if this is a safe migration target node for misplaced NUMA
+ * pages. Currently it only checks the watermarks which is crude.
+ */
+static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
+				   unsigned long nr_migrate_pages)
+{
+	int z;
+
+	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+		struct zone *zone = pgdat->node_zones + z;
+
+		if (!managed_zone(zone))
+			continue;
+
+		/* Avoid waking kswapd by allocating pages_to_migrate pages.
*/
+		if (!zone_watermark_ok(zone, 0,
+				       high_wmark_pages(zone) +
+				       nr_migrate_pages,
+				       ZONE_MOVABLE, 0))
+			continue;
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Target allocation callback used by migrate_misplaced_page(): @data is the
+ * destination node id. The allocation is restricted to that node
+ * (__GFP_THISNODE); compound pages use GFP_TRANSHUGE_LIGHT, order-0 pages
+ * are allocated with __GFP_RECLAIM cleared.
+ */
+static struct page *alloc_misplaced_dst_page(struct page *page,
+					   unsigned long data)
+{
+	int nid = (int) data;
+	int order = compound_order(page);
+	gfp_t gfp = __GFP_THISNODE;
+	struct folio *new;
+
+	if (order > 0)
+		gfp |= GFP_TRANSHUGE_LIGHT;
+	else {
+		gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY |
+			__GFP_NOWARN;
+		gfp &= ~__GFP_RECLAIM;
+	}
+	new = __folio_alloc_node(gfp, order, nid);
+
+	return &new->page;
+}
+
+/*
+ * Isolate @page from the LRU ahead of a NUMA-balancing migration to @pgdat.
+ *
+ * Return: 1 if the page was isolated, in which case the caller's reference
+ * has been dropped here; 0 if nothing was done and the caller still holds
+ * its reference.
+ */
+static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+	int nr_pages = thp_nr_pages(page);
+	int order = compound_order(page);
+
+	VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);
+
+	/* Do not migrate THP mapped by multiple processes */
+	if (PageTransHuge(page) && total_mapcount(page) > 1)
+		return 0;
+
+	/* Avoid migrating to a node that is nearly full */
+	if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
+		int z;
+
+		if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
+			return 0;
+		for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+			if (managed_zone(pgdat->node_zones + z))
+				break;
+		}
+
+		/*
+		 * If there are no managed zones the loop ends with z == -1
+		 * and node_zones + z would point in front of the zone array,
+		 * so there is nothing kswapd could be woken up for.
+		 */
+		if (z < 0)
+			return 0;
+
+		wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
+		return 0;
+	}
+
+	if (isolate_lru_page(page))
+		return 0;
+
+	mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page),
+			    nr_pages);
+
+	/*
+	 * Isolating the page has taken another reference, so the
+	 * caller's reference can be safely dropped without the page
+	 * disappearing underneath us during migration.
+	 */
+	put_page(page);
+	return 1;
+}
+
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node. Caller is expected to have an elevated reference count on
+ * the page that will be dropped by this function before returning.
+ */
+int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+			   int node)
+{
+	pg_data_t *pgdat = NODE_DATA(node);
+	int isolated;
+	int nr_remaining;
+	unsigned int nr_succeeded;
+	LIST_HEAD(migratepages);
+	int nr_pages = thp_nr_pages(page);
+
+	/*
+	 * Don't migrate file pages that are mapped in multiple processes
+	 * with execute permissions as they are probably shared libraries.
+	 */
+	if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
+	    (vma->vm_flags & VM_EXEC))
+		goto out;
+
+	/*
+	 * Also do not migrate dirty pages as not all filesystems can move
+	 * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
+	 */
+	if (page_is_file_lru(page) && PageDirty(page))
+		goto out;
+
+	/* On success the caller's reference has already been dropped. */
+	isolated = numamigrate_isolate_page(pgdat, page);
+	if (!isolated)
+		goto out;
+
+	list_add(&page->lru, &migratepages);
+	nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+				     NULL, node, MIGRATE_ASYNC,
+				     MR_NUMA_MISPLACED, &nr_succeeded);
+	if (nr_remaining) {
+		/*
+		 * The page is still on the list: undo the NR_ISOLATED_*
+		 * accounting and give it back to the LRU.
+		 */
+		if (!list_empty(&migratepages)) {
+			list_del(&page->lru);
+			mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+					page_is_file_lru(page), -nr_pages);
+			putback_lru_page(page);
+		}
+		isolated = 0;
+	}
+	if (nr_succeeded) {
+		count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
+		if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
+			mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
+					    nr_succeeded);
+	}
+	BUG_ON(!list_empty(&migratepages));
+	return isolated;
+
+out:
+	/* Nothing was isolated: drop the caller's reference here. */
+	put_page(page);
+	return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_NUMA */
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
new file mode 100644
index 000000000..721b2365d
--- /dev/null
+++ b/mm/migrate_device.c
@@ -0,0 +1,975 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device Memory Migration functionality.
+ *
+ * Originally written by Jérôme Glisse.
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int migrate_vma_collect_skip(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = 0; + } + + return 0; +} + +static int migrate_vma_collect_hole(unsigned long start, + unsigned long end, + __always_unused int depth, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + /* Only allow populating anonymous memory. */ + if (!vma_is_anonymous(walk->vma)) + return migrate_vma_collect_skip(start, end, walk); + + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; + migrate->dst[migrate->npages] = 0; + migrate->npages++; + migrate->cpages++; + } + + return 0; +} + +static int migrate_vma_collect_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + struct vm_area_struct *vma = walk->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start, unmapped = 0; + spinlock_t *ptl; + pte_t *ptep; + +again: + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, -1, walk); + + if (pmd_trans_huge(*pmdp)) { + struct page *page; + + ptl = pmd_lock(mm, pmdp); + if (unlikely(!pmd_trans_huge(*pmdp))) { + spin_unlock(ptl); + goto again; + } + + page = pmd_page(*pmdp); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmdp, addr); + if (pmd_trans_unstable(pmdp)) + return migrate_vma_collect_skip(start, end, + walk); + } else { + int ret; + + get_page(page); + spin_unlock(ptl); + if (unlikely(!trylock_page(page))) + return migrate_vma_collect_skip(start, end, + walk); + ret = split_huge_page(page); + 
unlock_page(page); + put_page(page); + if (ret) + return migrate_vma_collect_skip(start, end, + walk); + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, -1, + walk); + } + } + + if (unlikely(pmd_bad(*pmdp))) + return migrate_vma_collect_skip(start, end, walk); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + arch_enter_lazy_mmu_mode(); + + for (; addr < end; addr += PAGE_SIZE, ptep++) { + unsigned long mpfn = 0, pfn; + struct page *page; + swp_entry_t entry; + pte_t pte; + + pte = *ptep; + + if (pte_none(pte)) { + if (vma_is_anonymous(vma)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + } + goto next; + } + + if (!pte_present(pte)) { + /* + * Only care about unaddressable device page special + * page table entry. Other special swap entries are not + * migratable, and we ignore regular swapped page. + */ + entry = pte_to_swp_entry(pte); + if (!is_device_private_entry(entry)) + goto next; + + page = pfn_swap_entry_to_page(entry); + if (!(migrate->flags & + MIGRATE_VMA_SELECT_DEVICE_PRIVATE) || + page->pgmap->owner != migrate->pgmap_owner) + goto next; + + mpfn = migrate_pfn(page_to_pfn(page)) | + MIGRATE_PFN_MIGRATE; + if (is_writable_device_private_entry(entry)) + mpfn |= MIGRATE_PFN_WRITE; + } else { + pfn = pte_pfn(pte); + if (is_zero_pfn(pfn) && + (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + goto next; + } + page = vm_normal_page(migrate->vma, addr, pte); + if (page && !is_zone_device_page(page) && + !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) + goto next; + else if (page && is_device_coherent_page(page) && + (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) || + page->pgmap->owner != migrate->pgmap_owner)) + goto next; + mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + mpfn |= pte_write(pte) ? 
MIGRATE_PFN_WRITE : 0; + } + + /* FIXME support THP */ + if (!page || !page->mapping || PageTransCompound(page)) { + mpfn = 0; + goto next; + } + + /* + * By getting a reference on the page we pin it and that blocks + * any kind of migration. Side effect is that it "freezes" the + * pte. + * + * We drop this reference after isolating the page from the lru + * for non device page (device page are not on the lru and thus + * can't be dropped from it). + */ + get_page(page); + + /* + * We rely on trylock_page() to avoid deadlock between + * concurrent migrations where each is waiting on the others + * page lock. If we can't immediately lock the page we fail this + * migration as it is only best effort anyway. + * + * If we can lock the page it's safe to set up a migration entry + * now. In the common case where the page is mapped once in a + * single process setting up the migration entry now is an + * optimisation to avoid walking the rmap later with + * try_to_migrate(). + */ + if (trylock_page(page)) { + bool anon_exclusive; + pte_t swp_pte; + + flush_cache_page(vma, addr, pte_pfn(*ptep)); + anon_exclusive = PageAnon(page) && PageAnonExclusive(page); + if (anon_exclusive) { + pte = ptep_clear_flush(vma, addr, ptep); + + if (page_try_share_anon_rmap(page)) { + set_pte_at(mm, addr, ptep, pte); + unlock_page(page); + put_page(page); + mpfn = 0; + goto next; + } + } else { + pte = ptep_get_and_clear(mm, addr, ptep); + } + + migrate->cpages++; + + /* Set the dirty flag on the folio now the pte is gone. 
*/ + if (pte_dirty(pte)) + folio_mark_dirty(page_folio(page)); + + /* Setup special migration page table entry */ + if (mpfn & MIGRATE_PFN_WRITE) + entry = make_writable_migration_entry( + page_to_pfn(page)); + else if (anon_exclusive) + entry = make_readable_exclusive_migration_entry( + page_to_pfn(page)); + else + entry = make_readable_migration_entry( + page_to_pfn(page)); + if (pte_present(pte)) { + if (pte_young(pte)) + entry = make_migration_entry_young(entry); + if (pte_dirty(pte)) + entry = make_migration_entry_dirty(entry); + } + swp_pte = swp_entry_to_pte(entry); + if (pte_present(pte)) { + if (pte_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } else { + if (pte_swp_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_swp_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } + set_pte_at(mm, addr, ptep, swp_pte); + + /* + * This is like regular unmap: we remove the rmap and + * drop page refcount. Page won't be freed, as we took + * a reference just above. + */ + page_remove_rmap(page, vma, false); + put_page(page); + + if (pte_present(pte)) + unmapped++; + } else { + put_page(page); + mpfn = 0; + } + +next: + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = mpfn; + } + + /* Only flush the TLB if we actually modified any entries */ + if (unmapped) + flush_tlb_range(walk->vma, start, end); + + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep - 1, ptl); + + return 0; +} + +static const struct mm_walk_ops migrate_vma_walk_ops = { + .pmd_entry = migrate_vma_collect_pmd, + .pte_hole = migrate_vma_collect_hole, +}; + +/* + * migrate_vma_collect() - collect pages over a range of virtual addresses + * @migrate: migrate struct containing all migration information + * + * This will walk the CPU page table. 
For each virtual address backed by a + * valid page, it updates the src array and takes a reference on the page, in + * order to pin the page until we lock it and unmap it. + */ +static void migrate_vma_collect(struct migrate_vma *migrate) +{ + struct mmu_notifier_range range; + + /* + * Note that the pgmap_owner is passed to the mmu notifier callback so + * that the registered device driver can skip invalidating device + * private page mappings that won't be migrated. + */ + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0, + migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end, + migrate->pgmap_owner); + mmu_notifier_invalidate_range_start(&range); + + walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, + &migrate_vma_walk_ops, migrate); + + mmu_notifier_invalidate_range_end(&range); + migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); +} + +/* + * migrate_vma_check_page() - check if page is pinned or not + * @page: struct page to check + * + * Pinned pages cannot be migrated. This is the same test as in + * folio_migrate_mapping(), except that here we allow migration of a + * ZONE_DEVICE page. + */ +static bool migrate_vma_check_page(struct page *page, struct page *fault_page) +{ + /* + * One extra ref because caller holds an extra reference, either from + * isolate_lru_page() for a regular page, or migrate_vma_collect() for + * a device page. + */ + int extra = 1 + (page == fault_page); + + /* + * FIXME support THP (transparent huge page), it is bit more complex to + * check them than regular pages, because they can be mapped with a pmd + * or with a pte (split pte mapping). 
+ */ + if (PageCompound(page)) + return false; + + /* Page from ZONE_DEVICE have one extra reference */ + if (is_zone_device_page(page)) + extra++; + + /* For file back page */ + if (page_mapping(page)) + extra += 1 + page_has_private(page); + + if ((page_count(page) - extra) > page_mapcount(page)) + return false; + + return true; +} + +/* + * Unmaps pages for migration. Returns number of source pfns marked as + * migrating. + */ +static unsigned long migrate_device_unmap(unsigned long *src_pfns, + unsigned long npages, + struct page *fault_page) +{ + unsigned long i, restore = 0; + bool allow_drain = true; + unsigned long unmapped = 0; + + lru_add_drain(); + + for (i = 0; i < npages; i++) { + struct page *page = migrate_pfn_to_page(src_pfns[i]); + struct folio *folio; + + if (!page) { + if (src_pfns[i] & MIGRATE_PFN_MIGRATE) + unmapped++; + continue; + } + + /* ZONE_DEVICE pages are not on LRU */ + if (!is_zone_device_page(page)) { + if (!PageLRU(page) && allow_drain) { + /* Drain CPU's pagevec */ + lru_add_drain_all(); + allow_drain = false; + } + + if (isolate_lru_page(page)) { + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + restore++; + continue; + } + + /* Drop the reference we took in collect */ + put_page(page); + } + + folio = page_folio(page); + if (folio_mapped(folio)) + try_to_migrate(folio, 0); + + if (page_mapped(page) || + !migrate_vma_check_page(page, fault_page)) { + if (!is_zone_device_page(page)) { + get_page(page); + putback_lru_page(page); + } + + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + restore++; + continue; + } + + unmapped++; + } + + for (i = 0; i < npages && restore; i++) { + struct page *page = migrate_pfn_to_page(src_pfns[i]); + struct folio *folio; + + if (!page || (src_pfns[i] & MIGRATE_PFN_MIGRATE)) + continue; + + folio = page_folio(page); + remove_migration_ptes(folio, folio, false); + + src_pfns[i] = 0; + folio_unlock(folio); + folio_put(folio); + restore--; + } + + return unmapped; +} + +/* + * migrate_vma_unmap() - replace page mapping 
with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Isolate pages from the LRU and replace mappings (CPU page table pte) with a + * special migration pte entry and check if it has been pinned. Pinned pages are + * restored because we cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. + */ +static void migrate_vma_unmap(struct migrate_vma *migrate) +{ + migrate->cpages = migrate_device_unmap(migrate->src, migrate->npages, + migrate->fault_page); +} + +/** + * migrate_vma_setup() - prepare to migrate a range of memory + * @args: contains the vma, start, and pfns arrays for the migration + * + * Returns: negative errno on failures, 0 when 0 or more pages were migrated + * without an error. + * + * Prepare to migrate a range of virtual memory addresses by collecting all + * the pages backing each virtual address in the range, saving them inside the + * src array. Then lock those pages and unmap them. Once the pages are locked + * and unmapped, check whether each page is pinned or not. Pages that aren't + * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the + * corresponding src array entry. Then restore any pages that are pinned, by + * remapping and unlocking those pages. + * + * The caller should then allocate destination memory and copy source memory to + * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE + * flag set). Once these are allocated and copied, the caller must update each + * corresponding entry in the dst array with the pfn value of the destination + * page and with MIGRATE_PFN_VALID. Destination pages must be locked via + * lock_page().
+ * + * Note that the caller does not have to migrate all the pages that are marked + * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from + * device memory to system memory. If the caller cannot migrate a device page + * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe + * consequences for the userspace process, so it must be avoided if at all + * possible. + * + * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we + * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus + * allowing the caller to allocate device memory for those unbacked virtual + * addresses. For this the caller simply has to allocate device memory and + * properly set the destination entry like for regular migration. Note that + * this can still fail, and thus inside the device driver you must check if the + * migration was successful for those entries after calling migrate_vma_pages(), + * just like for regular migration. + * + * After that, the callers must call migrate_vma_pages() to go over each entry + * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, + * then migrate_vma_pages() migrates struct page information from the source + * struct page to the destination struct page. If it fails to migrate the + * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the + * src array. + * + * At this point all successfully migrated pages have an entry in the src + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst + * array entry with MIGRATE_PFN_VALID flag set. + * + * Once migrate_vma_pages() returns the caller may inspect which pages were + * successfully migrated, and which were not. Successfully migrated pages will + * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
+ * + * It is safe to update device page table after migrate_vma_pages() because + * both destination and source page are still locked, and the mmap_lock is held + * in read mode (hence no one can unmap the range being migrated). + * + * Once the caller is done cleaning up things and updating its page table (if it + * chose to do so, this is not an obligation) it finally calls + * migrate_vma_finalize() to update the CPU page table to point to new pages + * for successfully migrated pages or otherwise restore the CPU page table to + * point to the original source pages. + */ +int migrate_vma_setup(struct migrate_vma *args) +{ + long nr_pages = (args->end - args->start) >> PAGE_SHIFT; + + args->start &= PAGE_MASK; + args->end &= PAGE_MASK; + if (!args->vma || is_vm_hugetlb_page(args->vma) || + (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) + return -EINVAL; + if (nr_pages <= 0) + return -EINVAL; + if (args->start < args->vma->vm_start || + args->start >= args->vma->vm_end) + return -EINVAL; + if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) + return -EINVAL; + if (!args->src || !args->dst) + return -EINVAL; + if (args->fault_page && !is_device_private_page(args->fault_page)) + return -EINVAL; + + memset(args->src, 0, sizeof(*args->src) * nr_pages); + args->cpages = 0; + args->npages = 0; + + migrate_vma_collect(args); + + if (args->cpages) + migrate_vma_unmap(args); + + /* + * At this point pages are locked and unmapped, and thus they have + * stable content and can safely be copied to destination memory that + * is allocated by the drivers. + */ + return 0; + +} +EXPORT_SYMBOL(migrate_vma_setup); + +/* + * This code closely matches the code in: + * __handle_mm_fault() + * handle_pte_fault() + * do_anonymous_page() + * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE + * private or coherent page. 
+ */ +static void migrate_vma_insert_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src) +{ + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + bool flush = false; + spinlock_t *ptl; + pte_t entry; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + /* Only allow populating anonymous memory */ + if (!vma_is_anonymous(vma)) + goto abort; + + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + goto abort; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + goto abort; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + goto abort; + + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) + goto abort; + + /* + * Use pte_alloc() instead of pte_alloc_map(). We can't run + * pte_offset_map() on pmds where a huge pmd might be created + * from a different thread. + * + * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when + * parallel threads are excluded by other means. + * + * Here we only have mmap_read_lock(mm). + */ + if (pte_alloc(mm, pmdp)) + goto abort; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(pmdp))) + goto abort; + + if (unlikely(anon_vma_prepare(vma))) + goto abort; + if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL)) + goto abort; + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. 
+ */ + __SetPageUptodate(page); + + if (is_device_private_page(page)) { + swp_entry_t swp_entry; + + if (vma->vm_flags & VM_WRITE) + swp_entry = make_writable_device_private_entry( + page_to_pfn(page)); + else + swp_entry = make_readable_device_private_entry( + page_to_pfn(page)); + entry = swp_entry_to_pte(swp_entry); + } else { + if (is_zone_device_page(page) && + !is_device_coherent_page(page)) { + pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); + goto abort; + } + entry = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + } + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + + if (check_stable_address_space(mm)) + goto unlock_abort; + + if (pte_present(*ptep)) { + unsigned long pfn = pte_pfn(*ptep); + + if (!is_zero_pfn(pfn)) + goto unlock_abort; + flush = true; + } else if (!pte_none(*ptep)) + goto unlock_abort; + + /* + * Check for userfaultfd but do not deliver the fault. Instead, + * just back off. + */ + if (userfaultfd_missing(vma)) + goto unlock_abort; + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr); + if (!is_zone_device_page(page)) + lru_cache_add_inactive_or_unevictable(page, vma); + get_page(page); + + if (flush) { + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } else { + /* No need to invalidate - it was non-present before */ + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } + + pte_unmap_unlock(ptep, ptl); + *src = MIGRATE_PFN_MIGRATE; + return; + +unlock_abort: + pte_unmap_unlock(ptep, ptl); +abort: + *src &= ~MIGRATE_PFN_MIGRATE; +} + +static void __migrate_device_pages(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages, + struct migrate_vma *migrate) +{ + struct mmu_notifier_range range; + unsigned long i; + bool notified = false; + + for (i = 0; i < npages; i++) { + 
struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); + struct address_space *mapping; + int r; + + if (!newpage) { + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + + if (!page) { + unsigned long addr; + + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) + continue; + + /* + * The only time there is no vma is when called from + * migrate_device_coherent_page(). However this isn't + * called if the page could not be unmapped. + */ + VM_BUG_ON(!migrate); + addr = migrate->start + i*PAGE_SIZE; + if (!notified) { + notified = true; + + mmu_notifier_range_init_owner(&range, + MMU_NOTIFY_MIGRATE, 0, migrate->vma, + migrate->vma->vm_mm, addr, migrate->end, + migrate->pgmap_owner); + mmu_notifier_invalidate_range_start(&range); + } + migrate_vma_insert_page(migrate, addr, newpage, + &src_pfns[i]); + continue; + } + + mapping = page_mapping(page); + + if (is_device_private_page(newpage) || + is_device_coherent_page(newpage)) { + /* + * For now only support anonymous memory migrating to + * device private or coherent memory. + */ + if (mapping) { + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } else if (is_zone_device_page(newpage)) { + /* + * Other types of ZONE_DEVICE page are not supported. + */ + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + + if (migrate && migrate->fault_page == page) + r = migrate_folio_extra(mapping, page_folio(newpage), + page_folio(page), + MIGRATE_SYNC_NO_COPY, 1); + else + r = migrate_folio(mapping, page_folio(newpage), + page_folio(page), MIGRATE_SYNC_NO_COPY); + if (r != MIGRATEPAGE_SUCCESS) + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + } + + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() + * did already call it. 
+ */ + if (notified) + mmu_notifier_invalidate_range_only_end(&range); +} + +/** + * migrate_device_pages() - migrate meta-data from src page to dst page + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range + * + * Equivalent to migrate_vma_pages(). This is called to migrate struct page + * meta-data from source struct page to destination. + */ +void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, + unsigned long npages) +{ + __migrate_device_pages(src_pfns, dst_pfns, npages, NULL); +} +EXPORT_SYMBOL(migrate_device_pages); + +/** + * migrate_vma_pages() - migrate meta-data from src page to dst page + * @migrate: migrate struct containing all migration information + * + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. + */ +void migrate_vma_pages(struct migrate_vma *migrate) +{ + __migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); +} +EXPORT_SYMBOL(migrate_vma_pages); + +/* + * migrate_device_finalize() - complete page migration + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range + * + * Completes migration of the page by removing special migration entries. + * Drivers must ensure copying of page data is complete and visible to the CPU + * before calling this. 
+ */ +void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages) +{ + unsigned long i; + + for (i = 0; i < npages; i++) { + struct folio *dst, *src; + struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); + + if (!page) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + continue; + } + + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + newpage = page; + } + + src = page_folio(page); + dst = page_folio(newpage); + remove_migration_ptes(src, dst, false); + folio_unlock(src); + + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); + + if (newpage != page) { + unlock_page(newpage); + if (is_zone_device_page(newpage)) + put_page(newpage); + else + putback_lru_page(newpage); + } + } +} +EXPORT_SYMBOL(migrate_device_finalize); + +/** + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. + */ +void migrate_vma_finalize(struct migrate_vma *migrate) +{ + migrate_device_finalize(migrate->src, migrate->dst, migrate->npages); +} +EXPORT_SYMBOL(migrate_vma_finalize); + +/** + * migrate_device_range() - migrate device private pfns to normal memory. + * @src_pfns: array large enough to hold migrating source device private pfns. + * @start: starting pfn in the range to migrate. + * @npages: number of pages to migrate. 
+ * + * migrate_device_range() is similar in concept to migrate_vma_setup() except that + * instead of looking up pages based on virtual address mappings a range of + * device pfns that should be migrated to system memory is used instead. + * + * This is useful when a driver needs to free device memory but doesn't know the + * virtual mappings of every page that may be in device memory. For example this + * is often the case when a driver is being unloaded or unbound from a device. + * + * Like migrate_vma_setup() this function will take a reference and lock any + * migrating pages that aren't free before unmapping them. Drivers may then + * allocate destination pages and start copying data from the device to CPU + * memory before calling migrate_device_pages(). + */ +int migrate_device_range(unsigned long *src_pfns, unsigned long start, + unsigned long npages) +{ + unsigned long i, pfn; + + for (pfn = start, i = 0; i < npages; pfn++, i++) { + struct page *page = pfn_to_page(pfn); + + if (!get_page_unless_zero(page)) { + src_pfns[i] = 0; + continue; + } + + if (!trylock_page(page)) { + src_pfns[i] = 0; + put_page(page); + continue; + } + + src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + } + + migrate_device_unmap(src_pfns, npages, NULL); + + return 0; +} +EXPORT_SYMBOL(migrate_device_range); + +/* + * Migrate a device coherent page back to normal memory. The caller should have + * a reference on page which will be copied to the new page if migration is + * successful or dropped on failure. + */ +int migrate_device_coherent_page(struct page *page) +{ + unsigned long src_pfn, dst_pfn = 0; + struct page *dpage; + + WARN_ON_ONCE(PageCompound(page)); + + lock_page(page); + src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; + + /* + * We don't have a VMA and don't need to walk the page tables to find + * the source page. So call migrate_device_unmap() directly to unmap the + * page as migrate_vma_setup() will fail if args.vma == NULL.
+ */ + migrate_device_unmap(&src_pfn, 1, NULL); + if (!(src_pfn & MIGRATE_PFN_MIGRATE)) + return -EBUSY; + + dpage = alloc_page(GFP_USER | __GFP_NOWARN); + if (dpage) { + lock_page(dpage); + dst_pfn = migrate_pfn(page_to_pfn(dpage)); + } + + migrate_device_pages(&src_pfn, &dst_pfn, 1); + if (src_pfn & MIGRATE_PFN_MIGRATE) + copy_highpage(dpage, page); + migrate_device_finalize(&src_pfn, &dst_pfn, 1); + + if (src_pfn & MIGRATE_PFN_MIGRATE) + return 0; + return -EBUSY; +} diff --git a/mm/mincore.c b/mm/mincore.c new file mode 100644 index 000000000..1eb6aac88 --- /dev/null +++ b/mm/mincore.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/mm/mincore.c + * + * Copyright (C) 1994-2006 Linus Torvalds + */ + +/* + * The mincore() system call. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "swap.h" + +static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ +#ifdef CONFIG_HUGETLB_PAGE + unsigned char present; + unsigned char *vec = walk->private; + + /* + * Hugepages under user process are always in RAM and never + * swapped out, but theoretically it needs to be checked. + */ + present = pte && !huge_pte_none_mostly(huge_ptep_get(pte)); + for (; addr != end; vec++, addr += PAGE_SIZE) + *vec = present; + walk->private = vec; +#else + BUG(); +#endif + return 0; +} + +/* + * Later we can get more picky about what "in core" means precisely. + * For now, simply check to see if the page is in the page cache, + * and is up to date; i.e. that no page-in operation would be required + * at this time if an application were to map and access this page. 
+ */ +static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) +{ + unsigned char present = 0; + struct page *page; + + /* + * When tmpfs swaps out a page from a file, any process mapping that + * file will not get a swp_entry_t in its pte, but rather it is like + * any other file mapping (ie. marked !present and faulted in with + * tmpfs's .fault). So swapped out tmpfs mappings are tested here. + */ + page = find_get_incore_page(mapping, index); + if (page) { + present = PageUptodate(page); + put_page(page); + } + + return present; +} + +static int __mincore_unmapped_range(unsigned long addr, unsigned long end, + struct vm_area_struct *vma, unsigned char *vec) +{ + unsigned long nr = (end - addr) >> PAGE_SHIFT; + int i; + + if (vma->vm_file) { + pgoff_t pgoff; + + pgoff = linear_page_index(vma, addr); + for (i = 0; i < nr; i++, pgoff++) + vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); + } else { + for (i = 0; i < nr; i++) + vec[i] = 0; + } + return nr; +} + +static int mincore_unmapped_range(unsigned long addr, unsigned long end, + __always_unused int depth, + struct mm_walk *walk) +{ + walk->private += __mincore_unmapped_range(addr, end, + walk->vma, walk->private); + return 0; +} + +static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + spinlock_t *ptl; + struct vm_area_struct *vma = walk->vma; + pte_t *ptep; + unsigned char *vec = walk->private; + int nr = (end - addr) >> PAGE_SHIFT; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + memset(vec, 1, nr); + spin_unlock(ptl); + goto out; + } + + if (pmd_trans_unstable(pmd)) { + __mincore_unmapped_range(addr, end, vma, vec); + goto out; + } + + ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + for (; addr != end; ptep++, addr += PAGE_SIZE) { + pte_t pte = *ptep; + + /* We need to do cache lookup too for pte markers */ + if (pte_none_mostly(pte)) + __mincore_unmapped_range(addr, addr + PAGE_SIZE, + vma, vec); + else if 
(pte_present(pte)) + *vec = 1; + else { /* pte is a swap entry */ + swp_entry_t entry = pte_to_swp_entry(pte); + + if (non_swap_entry(entry)) { + /* + * migration or hwpoison entries are always + * uptodate + */ + *vec = 1; + } else { +#ifdef CONFIG_SWAP + *vec = mincore_page(swap_address_space(entry), + swp_offset(entry)); +#else + WARN_ON(1); + *vec = 1; +#endif + } + } + vec++; + } + pte_unmap_unlock(ptep - 1, ptl); +out: + walk->private += nr; + cond_resched(); + return 0; +} + +static inline bool can_do_mincore(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return true; + if (!vma->vm_file) + return false; + /* + * Reveal pagecache information only for non-anonymous mappings that + * correspond to the files the calling process could (if tried) open + * for writing; otherwise we'd be including shared non-exclusive + * mappings, which opens a side channel. + */ + return inode_owner_or_capable(&init_user_ns, + file_inode(vma->vm_file)) || + file_permission(vma->vm_file, MAY_WRITE) == 0; +} + +static const struct mm_walk_ops mincore_walk_ops = { + .pmd_entry = mincore_pte_range, + .pte_hole = mincore_unmapped_range, + .hugetlb_entry = mincore_hugetlb, +}; + +/* + * Do a chunk of "sys_mincore()". We've already checked + * all the arguments, we hold the mmap semaphore: we should + * just return the amount of info we're asked for. + */ +static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec) +{ + struct vm_area_struct *vma; + unsigned long end; + int err; + + vma = find_vma(current->mm, addr); + if (!vma || addr < vma->vm_start) + return -ENOMEM; + end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); + if (!can_do_mincore(vma)) { + unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE); + memset(vec, 1, pages); + return pages; + } + err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec); + if (err < 0) + return err; + return (end - addr) >> PAGE_SHIFT; +} + +/* + * The mincore(2) system call. 
+ * + * mincore() returns the memory residency status of the pages in the + * current process's address space specified by [addr, addr + len). + * The status is returned in a vector of bytes. The least significant + * bit of each byte is 1 if the referenced page is in memory, otherwise + * it is zero. + * + * Because the status of a page can change after mincore() checks it + * but before it returns to the application, the returned vector may + * contain stale information. Only locked pages are guaranteed to + * remain in memory. + * + * return values: + * zero - success + * -EFAULT - vec points to an illegal address + * -EINVAL - addr is not a multiple of PAGE_SIZE + * -ENOMEM - Addresses in the range [addr, addr + len] are + * invalid for the address space of this process, or + * specify one or more pages which are not currently + * mapped + * -EAGAIN - A kernel resource was temporarily unavailable. + */ +SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, + unsigned char __user *, vec) +{ + long retval; + unsigned long pages; + unsigned char *tmp; + + start = untagged_addr(start); + + /* Check the start address: needs to be page-aligned.. */ + if (start & ~PAGE_MASK) + return -EINVAL; + + /* ..and we need to be passed a valid user-space range */ + if (!access_ok((void __user *) start, len)) + return -ENOMEM; + + /* This also avoids any overflows on PAGE_ALIGN */ + pages = len >> PAGE_SHIFT; + pages += (offset_in_page(len)) != 0; + + if (!access_ok(vec, pages)) + return -EFAULT; + + tmp = (void *) __get_free_page(GFP_USER); + if (!tmp) + return -EAGAIN; + + retval = 0; + while (pages) { + /* + * Do at most PAGE_SIZE entries per iteration, due to + * the temporary buffer size. 
+ */ + mmap_read_lock(current->mm); + retval = do_mincore(start, min(pages, PAGE_SIZE), tmp); + mmap_read_unlock(current->mm); + + if (retval <= 0) + break; + if (copy_to_user(vec, tmp, retval)) { + retval = -EFAULT; + break; + } + pages -= retval; + vec += retval; + start += retval << PAGE_SHIFT; + retval = 0; + } + free_page((unsigned long) tmp); + return retval; +} diff --git a/mm/mlock.c b/mm/mlock.c new file mode 100644 index 000000000..7032f6dd0 --- /dev/null +++ b/mm/mlock.c @@ -0,0 +1,777 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/mm/mlock.c + * + * (C) Copyright 1995 Linus Torvalds + * (C) Copyright 2002 Christoph Hellwig + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +struct mlock_pvec { + local_lock_t lock; + struct pagevec vec; +}; + +static DEFINE_PER_CPU(struct mlock_pvec, mlock_pvec) = { + .lock = INIT_LOCAL_LOCK(lock), +}; + +bool can_do_mlock(void) +{ + if (rlimit(RLIMIT_MEMLOCK) != 0) + return true; + if (capable(CAP_IPC_LOCK)) + return true; + return false; +} +EXPORT_SYMBOL(can_do_mlock); + +/* + * Mlocked pages are marked with PageMlocked() flag for efficient testing + * in vmscan and, possibly, the fault path; and to support semi-accurate + * statistics. + * + * An mlocked page [PageMlocked(page)] is unevictable. As such, it will + * be placed on the LRU "unevictable" list, rather than the [in]active lists. + * The unevictable list is an LRU sibling list to the [in]active lists. + * PageUnevictable is set to indicate the unevictable state. 
+ */ + +static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec) +{ + /* There is nothing more we can do while it's off LRU */ + if (!TestClearPageLRU(page)) + return lruvec; + + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + + if (unlikely(page_evictable(page))) { + /* + * This is a little surprising, but quite possible: + * PageMlocked must have got cleared already by another CPU. + * Could this page be on the Unevictable LRU? I'm not sure, + * but move it now if so. + */ + if (PageUnevictable(page)) { + del_page_from_lru_list(page, lruvec); + ClearPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGRESCUED, + thp_nr_pages(page)); + } + goto out; + } + + if (PageUnevictable(page)) { + if (PageMlocked(page)) + page->mlock_count++; + goto out; + } + + del_page_from_lru_list(page, lruvec); + ClearPageActive(page); + SetPageUnevictable(page); + page->mlock_count = !!PageMlocked(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); +out: + SetPageLRU(page); + return lruvec; +} + +static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + + /* As above, this is a little surprising, but possible */ + if (unlikely(page_evictable(page))) + goto out; + + SetPageUnevictable(page); + page->mlock_count = !!PageMlocked(page); + __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page)); +out: + add_page_to_lru_list(page, lruvec); + SetPageLRU(page); + return lruvec; +} + +static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec) +{ + int nr_pages = thp_nr_pages(page); + bool isolated = false; + + if (!TestClearPageLRU(page)) + goto munlock; + + isolated = true; + lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec); + + if (PageUnevictable(page)) { + /* Then mlock_count is maintained, but 
might undercount */ + if (page->mlock_count) + page->mlock_count--; + if (page->mlock_count) + goto out; + } + /* else assume that was the last mlock: reclaim will fix it if not */ + +munlock: + if (TestClearPageMlocked(page)) { + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + if (isolated || !PageUnevictable(page)) + __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); + else + __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); + } + + /* page_evictable() has to be checked *after* clearing Mlocked */ + if (isolated && PageUnevictable(page) && page_evictable(page)) { + del_page_from_lru_list(page, lruvec); + ClearPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); + } +out: + if (isolated) + SetPageLRU(page); + return lruvec; +} + +/* + * Flags held in the low bits of a struct page pointer on the mlock_pvec. + */ +#define LRU_PAGE 0x1 +#define NEW_PAGE 0x2 +static inline struct page *mlock_lru(struct page *page) +{ + return (struct page *)((unsigned long)page + LRU_PAGE); +} + +static inline struct page *mlock_new(struct page *page) +{ + return (struct page *)((unsigned long)page + NEW_PAGE); +} + +/* + * mlock_pagevec() is derived from pagevec_lru_move_fn(): + * perhaps that can make use of such page pointer flags in future, + * but for now just keep it for mlock. We could use three separate + * pagevecs instead, but one feels better (munlocking a full pagevec + * does not need to drain mlocking pagevecs first). 
+ */ +static void mlock_pagevec(struct pagevec *pvec) +{ + struct lruvec *lruvec = NULL; + unsigned long mlock; + struct page *page; + int i; + + for (i = 0; i < pagevec_count(pvec); i++) { + page = pvec->pages[i]; + mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE); + page = (struct page *)((unsigned long)page - mlock); + pvec->pages[i] = page; + + if (mlock & LRU_PAGE) + lruvec = __mlock_page(page, lruvec); + else if (mlock & NEW_PAGE) + lruvec = __mlock_new_page(page, lruvec); + else + lruvec = __munlock_page(page, lruvec); + } + + if (lruvec) + unlock_page_lruvec_irq(lruvec); + release_pages(pvec->pages, pvec->nr); + pagevec_reinit(pvec); +} + +void mlock_page_drain_local(void) +{ + struct pagevec *pvec; + + local_lock(&mlock_pvec.lock); + pvec = this_cpu_ptr(&mlock_pvec.vec); + if (pagevec_count(pvec)) + mlock_pagevec(pvec); + local_unlock(&mlock_pvec.lock); +} + +void mlock_page_drain_remote(int cpu) +{ + struct pagevec *pvec; + + WARN_ON_ONCE(cpu_online(cpu)); + pvec = &per_cpu(mlock_pvec.vec, cpu); + if (pagevec_count(pvec)) + mlock_pagevec(pvec); +} + +bool need_mlock_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(mlock_pvec.vec, cpu)); +} + +/** + * mlock_folio - mlock a folio already on (or temporarily off) LRU + * @folio: folio to be mlocked. + */ +void mlock_folio(struct folio *folio) +{ + struct pagevec *pvec; + + local_lock(&mlock_pvec.lock); + pvec = this_cpu_ptr(&mlock_pvec.vec); + + if (!folio_test_set_mlocked(folio)) { + int nr_pages = folio_nr_pages(folio); + + zone_stat_mod_folio(folio, NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + } + + folio_get(folio); + if (!pagevec_add(pvec, mlock_lru(&folio->page)) || + folio_test_large(folio) || lru_cache_disabled()) + mlock_pagevec(pvec); + local_unlock(&mlock_pvec.lock); +} + +/** + * mlock_new_page - mlock a newly allocated page not yet on LRU + * @page: page to be mlocked, either a normal page or a THP head. 
+ */ +void mlock_new_page(struct page *page) +{ + struct pagevec *pvec; + int nr_pages = thp_nr_pages(page); + + local_lock(&mlock_pvec.lock); + pvec = this_cpu_ptr(&mlock_pvec.vec); + SetPageMlocked(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); + + get_page(page); + if (!pagevec_add(pvec, mlock_new(page)) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + local_unlock(&mlock_pvec.lock); +} + +/** + * munlock_page - munlock a page + * @page: page to be munlocked, either a normal page or a THP head. + */ +void munlock_page(struct page *page) +{ + struct pagevec *pvec; + + local_lock(&mlock_pvec.lock); + pvec = this_cpu_ptr(&mlock_pvec.vec); + /* + * TestClearPageMlocked(page) must be left to __munlock_page(), + * which will check whether the page is multiply mlocked. + */ + + get_page(page); + if (!pagevec_add(pvec, page) || + PageHead(page) || lru_cache_disabled()) + mlock_pagevec(pvec); + local_unlock(&mlock_pvec.lock); +} + +static int mlock_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) + +{ + struct vm_area_struct *vma = walk->vma; + spinlock_t *ptl; + pte_t *start_pte, *pte; + struct page *page; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + if (!pmd_present(*pmd)) + goto out; + if (is_huge_zero_pmd(*pmd)) + goto out; + page = pmd_page(*pmd); + if (vma->vm_flags & VM_LOCKED) + mlock_folio(page_folio(page)); + else + munlock_page(page); + goto out; + } + + start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { + if (!pte_present(*pte)) + continue; + page = vm_normal_page(vma, addr, *pte); + if (!page || is_zone_device_page(page)) + continue; + if (PageTransCompound(page)) + continue; + if (vma->vm_flags & VM_LOCKED) + mlock_folio(page_folio(page)); + else + munlock_page(page); + } + pte_unmap(start_pte); +out: + spin_unlock(ptl); + cond_resched(); + 
return 0; +} + +/* + * mlock_vma_pages_range() - mlock any pages already in the range, + * or munlock all pages in the range. + * @vma - vma containing range to be mlock()ed or munlock()ed + * @start - start address in @vma of the range + * @end - end of range in @vma + * @newflags - the new set of flags for @vma. + * + * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED; + * called for munlock() and munlockall(), to clear VM_LOCKED from @vma. + */ +static void mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, vm_flags_t newflags) +{ + static const struct mm_walk_ops mlock_walk_ops = { + .pmd_entry = mlock_pte_range, + }; + + /* + * There is a slight chance that concurrent page migration, + * or page reclaim finding a page of this now-VM_LOCKED vma, + * will call mlock_vma_page() and raise page's mlock_count: + * double counting, leaving the page unevictable indefinitely. + * Communicate this danger to mlock_vma_page() with VM_IO, + * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas. + * mmap_lock is held in write mode here, so this weird + * combination should not be visible to other mmap_lock users; + * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED. + */ + if (newflags & VM_LOCKED) + newflags |= VM_IO; + WRITE_ONCE(vma->vm_flags, newflags); + + lru_add_drain(); + walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL); + lru_add_drain(); + + if (newflags & VM_IO) { + newflags &= ~VM_IO; + WRITE_ONCE(vma->vm_flags, newflags); + } +} + +/* + * mlock_fixup - handle mlock[all]/munlock[all] requests. + * + * Filters out "special" vmas -- VM_LOCKED never gets set for these, and + * munlock is a no-op. However, for some special vmas, we go ahead and + * populate the ptes. + * + * For vmas that pass the filters, merge/split as appropriate. 
+ */ +static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end, vm_flags_t newflags) +{ + struct mm_struct *mm = vma->vm_mm; + pgoff_t pgoff; + int nr_pages; + int ret = 0; + vm_flags_t oldflags = vma->vm_flags; + + if (newflags == oldflags || (oldflags & VM_SPECIAL) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || + vma_is_dax(vma) || vma_is_secretmem(vma)) + /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ + goto out; + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (*prev) { + vma = *prev; + goto success; + } + + if (start != vma->vm_start) { + ret = split_vma(mm, vma, start, 1); + if (ret) + goto out; + } + + if (end != vma->vm_end) { + ret = split_vma(mm, vma, end, 0); + if (ret) + goto out; + } + +success: + /* + * Keep track of amount of locked VM. + */ + nr_pages = (end - start) >> PAGE_SHIFT; + if (!(newflags & VM_LOCKED)) + nr_pages = -nr_pages; + else if (oldflags & VM_LOCKED) + nr_pages = 0; + mm->locked_vm += nr_pages; + + /* + * vm_flags is protected by the mmap_lock held in write mode. + * It's okay if try_to_unmap_one unmaps a page just after we + * set VM_LOCKED, populate_vma_page_range will bring it back. 
+ */ + + if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { + /* No work to do, and mlocking twice would be wrong */ + vma->vm_flags = newflags; + } else { + mlock_vma_pages_range(vma, start, end, newflags); + } +out: + *prev = vma; + return ret; +} + +static int apply_vma_lock_flags(unsigned long start, size_t len, + vm_flags_t flags) +{ + unsigned long nstart, end, tmp; + struct vm_area_struct *vma, *prev; + int error; + MA_STATE(mas, &current->mm->mm_mt, start, start); + + VM_BUG_ON(offset_in_page(start)); + VM_BUG_ON(len != PAGE_ALIGN(len)); + end = start + len; + if (end < start) + return -EINVAL; + if (end == start) + return 0; + vma = mas_walk(&mas); + if (!vma) + return -ENOMEM; + + if (start > vma->vm_start) + prev = vma; + else + prev = mas_prev(&mas, 0); + + for (nstart = start ; ; ) { + vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + + newflags |= flags; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + tmp = vma->vm_end; + if (tmp > end) + tmp = end; + error = mlock_fixup(vma, &prev, nstart, tmp, newflags); + if (error) + break; + nstart = tmp; + if (nstart < prev->vm_end) + nstart = prev->vm_end; + if (nstart >= end) + break; + + vma = find_vma(prev->vm_mm, prev->vm_end); + if (!vma || vma->vm_start != nstart) { + error = -ENOMEM; + break; + } + } + return error; +} + +/* + * Go through vma areas and sum size of mlocked + * vma pages, as return value. + * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT) + * is also counted.
+ * Return value: previously mlocked page counts + */ +static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, + unsigned long start, size_t len) +{ + struct vm_area_struct *vma; + unsigned long count = 0; + unsigned long end; + VMA_ITERATOR(vmi, mm, start); + + /* Don't overflow past ULONG_MAX */ + if (unlikely(ULONG_MAX - len < start)) + end = ULONG_MAX; + else + end = start + len; + + for_each_vma_range(vmi, vma, end) { + if (vma->vm_flags & VM_LOCKED) { + if (start > vma->vm_start) + count -= (start - vma->vm_start); + if (end < vma->vm_end) { + count += end - vma->vm_start; + break; + } + count += vma->vm_end - vma->vm_start; + } + } + + return count >> PAGE_SHIFT; +} + +/* + * convert get_user_pages() return value to posix mlock() error + */ +static int __mlock_posix_error_return(long retval) +{ + if (retval == -EFAULT) + retval = -ENOMEM; + else if (retval == -ENOMEM) + retval = -EAGAIN; + return retval; +} + +static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) +{ + unsigned long locked; + unsigned long lock_limit; + int error = -ENOMEM; + + start = untagged_addr(start); + + if (!can_do_mlock()) + return -EPERM; + + len = PAGE_ALIGN(len + (offset_in_page(start))); + start &= PAGE_MASK; + + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + locked = len >> PAGE_SHIFT; + + if (mmap_write_lock_killable(current->mm)) + return -EINTR; + + locked += current->mm->locked_vm; + if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) { + /* + * It is possible that the regions requested intersect with + * previously mlocked areas, that part area in "mm->locked_vm" + * should not be counted to new mlock increment count. So check + * and adjust locked count if necessary. 
+ */ + locked -= count_mm_mlocked_page_nr(current->mm, + start, len); + } + + /* check against resource limits */ + if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) + error = apply_vma_lock_flags(start, len, flags); + + mmap_write_unlock(current->mm); + if (error) + return error; + + error = __mm_populate(start, len, 0); + if (error) + return __mlock_posix_error_return(error); + return 0; +} + +SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) +{ + return do_mlock(start, len, VM_LOCKED); +} + +SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) +{ + vm_flags_t vm_flags = VM_LOCKED; + + if (flags & ~MLOCK_ONFAULT) + return -EINVAL; + + if (flags & MLOCK_ONFAULT) + vm_flags |= VM_LOCKONFAULT; + + return do_mlock(start, len, vm_flags); +} + +SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) +{ + int ret; + + start = untagged_addr(start); + + len = PAGE_ALIGN(len + (offset_in_page(start))); + start &= PAGE_MASK; + + if (mmap_write_lock_killable(current->mm)) + return -EINTR; + ret = apply_vma_lock_flags(start, len, 0); + mmap_write_unlock(current->mm); + + return ret; +} + +/* + * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall) + * and translate into the appropriate modifications to mm->def_flags and/or the + * flags for all current VMAs. + * + * There are a couple of subtleties with this. If mlockall() is called multiple + * times with different flags, the values do not necessarily stack. If mlockall + * is called once including the MCL_FUTURE flag and then a second time without + * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. 
+ */ +static int apply_mlockall_flags(int flags) +{ + MA_STATE(mas, &current->mm->mm_mt, 0, 0); + struct vm_area_struct *vma, *prev = NULL; + vm_flags_t to_add = 0; + + current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; + if (flags & MCL_FUTURE) { + current->mm->def_flags |= VM_LOCKED; + + if (flags & MCL_ONFAULT) + current->mm->def_flags |= VM_LOCKONFAULT; + + if (!(flags & MCL_CURRENT)) + goto out; + } + + if (flags & MCL_CURRENT) { + to_add |= VM_LOCKED; + if (flags & MCL_ONFAULT) + to_add |= VM_LOCKONFAULT; + } + + mas_for_each(&mas, vma, ULONG_MAX) { + vm_flags_t newflags; + + newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags |= to_add; + + /* Ignore errors */ + mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + mas_pause(&mas); + cond_resched(); + } +out: + return 0; +} + +SYSCALL_DEFINE1(mlockall, int, flags) +{ + unsigned long lock_limit; + int ret; + + if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) || + flags == MCL_ONFAULT) + return -EINVAL; + + if (!can_do_mlock()) + return -EPERM; + + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + + if (mmap_write_lock_killable(current->mm)) + return -EINTR; + + ret = -ENOMEM; + if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || + capable(CAP_IPC_LOCK)) + ret = apply_mlockall_flags(flags); + mmap_write_unlock(current->mm); + if (!ret && (flags & MCL_CURRENT)) + mm_populate(0, TASK_SIZE); + + return ret; +} + +SYSCALL_DEFINE0(munlockall) +{ + int ret; + + if (mmap_write_lock_killable(current->mm)) + return -EINTR; + ret = apply_mlockall_flags(0); + mmap_write_unlock(current->mm); + return ret; +} + +/* + * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB + * shm segments) get accounted against the user_struct instead.
+ */ +static DEFINE_SPINLOCK(shmlock_user_lock); + +int user_shm_lock(size_t size, struct ucounts *ucounts) +{ + unsigned long lock_limit, locked; + long memlock; + int allowed = 0; + + locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + lock_limit = rlimit(RLIMIT_MEMLOCK); + if (lock_limit != RLIM_INFINITY) + lock_limit >>= PAGE_SHIFT; + spin_lock(&shmlock_user_lock); + memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); + + if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); + goto out; + } + if (!get_ucounts(ucounts)) { + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); + allowed = 0; + goto out; + } + allowed = 1; +out: + spin_unlock(&shmlock_user_lock); + return allowed; +} + +void user_shm_unlock(size_t size, struct ucounts *ucounts) +{ + spin_lock(&shmlock_user_lock); + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT); + spin_unlock(&shmlock_user_lock); + put_ucounts(ucounts); +} diff --git a/mm/mm_init.c b/mm/mm_init.c new file mode 100644 index 000000000..0d7b2bd24 --- /dev/null +++ b/mm/mm_init.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm_init.c - Memory initialisation verification and debugging + * + * Copyright 2008 IBM Corporation, 2008 + * Author Mel Gorman + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +#ifdef CONFIG_DEBUG_MEMORY_INIT +int __meminitdata mminit_loglevel; + +/* The zonelists are simply reported, validation is manual. 
*/ +void __init mminit_verify_zonelist(void) +{ + int nid; + + if (mminit_loglevel < MMINIT_VERIFY) + return; + + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *zone; + struct zoneref *z; + struct zonelist *zonelist; + int i, listid, zoneid; + + BUILD_BUG_ON(MAX_ZONELISTS > 2); + for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) { + + /* Identify the zone and nodelist */ + zoneid = i % MAX_NR_ZONES; + listid = i / MAX_NR_ZONES; + zonelist = &pgdat->node_zonelists[listid]; + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + /* Print information about the zonelist */ + printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ", + listid > 0 ? "thisnode" : "general", nid, + zone->name); + + /* Iterate the zonelist */ + for_each_zone_zonelist(zone, z, zonelist, zoneid) + pr_cont("%d:%s ", zone_to_nid(zone), zone->name); + pr_cont("\n"); + } + } +} + +void __init mminit_verify_pageflags_layout(void) +{ + int shift, width; + unsigned long or_mask, add_mask; + + shift = 8 * sizeof(unsigned long); + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", + SECTIONS_WIDTH, + NODES_WIDTH, + ZONES_WIDTH, + LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, + LRU_GEN_WIDTH, + LRU_REFS_WIDTH, + NR_PAGEFLAGS); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", + SECTIONS_SHIFT, + NODES_SHIFT, + ZONES_SHIFT, + LAST_CPUPID_SHIFT, + KASAN_TAG_WIDTH); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", + "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n", + (unsigned long)SECTIONS_PGSHIFT, + (unsigned long)NODES_PGSHIFT, + (unsigned long)ZONES_PGSHIFT, + (unsigned long)LAST_CPUPID_PGSHIFT, + (unsigned long)KASAN_TAG_PGSHIFT); + 
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", + "Node/Zone ID: %lu -> %lu\n", + (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), + (unsigned long)ZONEID_PGOFF); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", + "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n", + shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); +#ifdef NODE_NOT_IN_PAGE_FLAGS + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", + "Node not in page flags"); +#endif +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", + "Last cpupid not in page flags"); +#endif + + if (SECTIONS_WIDTH) { + shift -= SECTIONS_WIDTH; + BUG_ON(shift != SECTIONS_PGSHIFT); + } + if (NODES_WIDTH) { + shift -= NODES_WIDTH; + BUG_ON(shift != NODES_PGSHIFT); + } + if (ZONES_WIDTH) { + shift -= ZONES_WIDTH; + BUG_ON(shift != ZONES_PGSHIFT); + } + + /* Check for bitmask overlaps */ + or_mask = (ZONES_MASK << ZONES_PGSHIFT) | + (NODES_MASK << NODES_PGSHIFT) | + (SECTIONS_MASK << SECTIONS_PGSHIFT); + add_mask = (ZONES_MASK << ZONES_PGSHIFT) + + (NODES_MASK << NODES_PGSHIFT) + + (SECTIONS_MASK << SECTIONS_PGSHIFT); + BUG_ON(or_mask != add_mask); +} + +static __init int set_mminit_loglevel(char *str) +{ + get_option(&str, &mminit_loglevel); + return 0; +} +early_param("mminit_loglevel", set_mminit_loglevel); +#endif /* CONFIG_DEBUG_MEMORY_INIT */ + +struct kobject *mm_kobj; +EXPORT_SYMBOL_GPL(mm_kobj); + +#ifdef CONFIG_SMP +s32 vm_committed_as_batch = 32; + +void mm_compute_batch(int overcommit_policy) +{ + u64 memsized_batch; + s32 nr = num_present_cpus(); + s32 batch = max_t(s32, nr*2, 32); + unsigned long ram_pages = totalram_pages(); + + /* + * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of + * (total memory/#cpus), and lift it to 25% for other policies + * to ease the possible lock contention for percpu_counter + * vm_committed_as, while the max limit is INT_MAX + */ + if (overcommit_policy == OVERCOMMIT_NEVER) + memsized_batch =
min_t(u64, ram_pages/nr/256, INT_MAX); + else + memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX); + + vm_committed_as_batch = max_t(s32, memsized_batch, batch); +} + +static int __meminit mm_compute_batch_notifier(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_ONLINE: + case MEM_OFFLINE: + mm_compute_batch(sysctl_overcommit_memory); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block compute_batch_nb __meminitdata = { + .notifier_call = mm_compute_batch_notifier, + .priority = IPC_CALLBACK_PRI, /* use lowest priority */ +}; + +static int __init mm_compute_batch_init(void) +{ + mm_compute_batch(sysctl_overcommit_memory); + register_hotmemory_notifier(&compute_batch_nb); + + return 0; +} + +__initcall(mm_compute_batch_init); + +#endif + +static int __init mm_sysfs_init(void) +{ + mm_kobj = kobject_create_and_add("mm", kernel_kobj); + if (!mm_kobj) + return -ENOMEM; + + return 0; +} +postcore_initcall(mm_sysfs_init); diff --git a/mm/mm_slot.h b/mm/mm_slot.h new file mode 100644 index 000000000..83f18ed1c --- /dev/null +++ b/mm/mm_slot.h @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _LINUX_MM_SLOT_H +#define _LINUX_MM_SLOT_H + +#include +#include + +/* + * struct mm_slot - hash lookup from mm to mm_slot + * @hash: link to the mm_slots hash list + * @mm_node: link into the mm_slots list + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node hash; + struct list_head mm_node; + struct mm_struct *mm; +}; + +#define mm_slot_entry(ptr, type, member) \ + container_of(ptr, type, member) + +static inline void *mm_slot_alloc(struct kmem_cache *cache) +{ + if (!cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(cache, GFP_KERNEL); +} + +static inline void mm_slot_free(struct kmem_cache *cache, void *objp) +{ + kmem_cache_free(cache, objp); +} + +#define mm_slot_lookup(_hashtable, _mm) \ +({ \ + struct 
mm_slot *tmp_slot, *mm_slot = NULL; \ + \ + hash_for_each_possible(_hashtable, tmp_slot, hash, (unsigned long)_mm) \ + if (_mm == tmp_slot->mm) { \ + mm_slot = tmp_slot; \ + break; \ + } \ + \ + mm_slot; \ +}) + +#define mm_slot_insert(_hashtable, _mm, _mm_slot) \ +({ \ + _mm_slot->mm = _mm; \ + hash_add(_hashtable, &_mm_slot->hash, (unsigned long)_mm); \ +}) + +#endif /* _LINUX_MM_SLOT_H */ diff --git a/mm/mmap.c b/mm/mmap.c new file mode 100644 index 000000000..c0f957549 --- /dev/null +++ b/mm/mmap.c @@ -0,0 +1,3901 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm/mmap.c + * + * Written by obz. + * + * Address space accounting code + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#include "internal.h" + +#ifndef arch_mmap_check +#define arch_mmap_check(addr, len, flags) (0) +#endif + +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS +const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; +const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; +int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS +const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN; +const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; +int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; +#endif + +static bool ignore_rlimit_data; +core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); + +static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, + struct vm_area_struct *vma, struct 
vm_area_struct *prev, + struct vm_area_struct *next, unsigned long start, + unsigned long end); + +static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) +{ + return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); +} + +/* Update vma->vm_page_prot to reflect vma->vm_flags. */ +void vma_set_page_prot(struct vm_area_struct *vma) +{ + unsigned long vm_flags = vma->vm_flags; + pgprot_t vm_page_prot; + + vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); + if (vma_wants_writenotify(vma, vm_page_prot)) { + vm_flags &= ~VM_SHARED; + vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); + } + /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ + WRITE_ONCE(vma->vm_page_prot, vm_page_prot); +} + +/* + * Requires inode->i_mapping->i_mmap_rwsem + */ +static void __remove_shared_vm_struct(struct vm_area_struct *vma, + struct file *file, struct address_space *mapping) +{ + if (vma->vm_flags & VM_SHARED) + mapping_unmap_writable(mapping); + + flush_dcache_mmap_lock(mapping); + vma_interval_tree_remove(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); +} + +/* + * Unlink a file-based vm structure from its interval tree, to hide + * vma from rmap and vmtruncate before freeing its page tables. + */ +void unlink_file_vma(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + + if (file) { + struct address_space *mapping = file->f_mapping; + i_mmap_lock_write(mapping); + __remove_shared_vm_struct(vma, file, mapping); + i_mmap_unlock_write(mapping); + } +} + +/* + * Close a vm structure and free it. + */ +static void remove_vma(struct vm_area_struct *vma) +{ + might_sleep(); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) + fput(vma->vm_file); + mpol_put(vma_policy(vma)); + vm_area_free(vma); +} + +/* + * check_brk_limits() - Use platform specific check of range & verify mlock + * limits. + * @addr: The address to check + * @len: The size of increase. 
+ * + * Return: 0 on success. + */ +static int check_brk_limits(unsigned long addr, unsigned long len) +{ + unsigned long mapped_addr; + + mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (IS_ERR_VALUE(mapped_addr)) + return mapped_addr; + + return mlock_future_check(current->mm, current->mm->def_flags, len); +} +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf); +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, + unsigned long addr, unsigned long request, unsigned long flags); +SYSCALL_DEFINE1(brk, unsigned long, brk) +{ + unsigned long newbrk, oldbrk, origbrk; + struct mm_struct *mm = current->mm; + struct vm_area_struct *brkvma, *next = NULL; + unsigned long min_brk; + bool populate; + bool downgraded = false; + LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, 0, 0); + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + origbrk = mm->brk; + +#ifdef CONFIG_COMPAT_BRK + /* + * CONFIG_COMPAT_BRK can still be overridden by setting + * randomize_va_space to 2, which will still cause mm->start_brk + * to be arbitrarily shifted + */ + if (current->brk_randomized) + min_brk = mm->start_brk; + else + min_brk = mm->end_data; +#else + min_brk = mm->start_brk; +#endif + if (brk < min_brk) + goto out; + + /* + * Check against rlimit here. If this check is done later after the test + * of oldbrk with newbrk then it can escape the test and let the data + * segment grow beyond its set limit the in case where the limit is + * not page aligned -Ram Gupta + */ + if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, + mm->end_data, mm->start_data)) + goto out; + + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); + if (oldbrk == newbrk) { + mm->brk = brk; + goto success; + } + + /* + * Always allow shrinking brk. + * do_brk_munmap() may downgrade mmap_lock to read. 
+ */ + if (brk <= mm->brk) { + int ret; + + /* Search one past newbrk */ + mas_set(&mas, newbrk); + brkvma = mas_find(&mas, oldbrk); + if (!brkvma || brkvma->vm_start >= oldbrk) + goto out; /* mapping intersects with an existing non-brk vma. */ + /* + * mm->brk must be protected by write mmap_lock. + * do_brk_munmap() may downgrade the lock, so update it + * before calling do_brk_munmap(). + */ + mm->brk = brk; + ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); + if (ret == 1) { + downgraded = true; + goto success; + } else if (!ret) + goto success; + + mm->brk = origbrk; + goto out; + } + + if (check_brk_limits(oldbrk, newbrk - oldbrk)) + goto out; + + /* + * Only check if the next VMA is within the stack_guard_gap of the + * expansion area + */ + mas_set(&mas, oldbrk); + next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap); + if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) + goto out; + + brkvma = mas_prev(&mas, mm->start_brk); + /* Ok, looks good - let it rip. */ + if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) + goto out; + + mm->brk = brk; + +success: + populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; + if (downgraded) + mmap_read_unlock(mm); + else + mmap_write_unlock(mm); + userfaultfd_unmap_complete(mm, &uf); + if (populate) + mm_populate(oldbrk, newbrk - oldbrk); + return brk; + +out: + mmap_write_unlock(mm); + return origbrk; +} + +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) +extern void mt_validate(struct maple_tree *mt); +extern void mt_dump(const struct maple_tree *mt); + +/* Validate the maple tree */ +static void validate_mm_mt(struct mm_struct *mm) +{ + struct maple_tree *mt = &mm->mm_mt; + struct vm_area_struct *vma_mt; + + MA_STATE(mas, mt, 0, 0); + + mt_validate(&mm->mm_mt); + mas_for_each(&mas, vma_mt, ULONG_MAX) { + if ((vma_mt->vm_start != mas.index) || + (vma_mt->vm_end - 1 != mas.last)) { + pr_emerg("issue in %s\n", current->comm); + dump_stack(); + dump_vma(vma_mt); + pr_emerg("mt piv: %p 
%lu - %lu\n", vma_mt, + mas.index, mas.last); + pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, + vma_mt->vm_start, vma_mt->vm_end); + + mt_dump(mas.tree); + if (vma_mt->vm_end != mas.last + 1) { + pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n", + mm, vma_mt->vm_start, vma_mt->vm_end, + mas.index, mas.last); + mt_dump(mas.tree); + } + VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm); + if (vma_mt->vm_start != mas.index) { + pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n", + mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end); + mt_dump(mas.tree); + } + VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); + } + } +} + +static void validate_mm(struct mm_struct *mm) +{ + int bug = 0; + int i = 0; + struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, 0, 0); + + validate_mm_mt(mm); + + mas_for_each(&mas, vma, ULONG_MAX) { +#ifdef CONFIG_DEBUG_VM_RB + struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc; + + if (anon_vma) { + anon_vma_lock_read(anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + anon_vma_unlock_read(anon_vma); + } +#endif + i++; + } + if (i != mm->map_count) { + pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i); + bug = 1; + } + VM_BUG_ON_MM(bug, mm); +} + +#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ +#define validate_mm_mt(root) do { } while (0) +#define validate_mm(mm) do { } while (0) +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ + +/* + * vma has some anon_vma assigned, and is already inserted on that + * anon_vma's interval trees. + * + * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the + * vma must be removed from the anon_vma's interval trees using + * anon_vma_interval_tree_pre_update_vma(). + * + * After the update, the vma will be reinserted using + * anon_vma_interval_tree_post_update_vma(). + * + * The entire update must be protected by exclusive mmap_lock and by + * the root anon_vma's mutex. 
+ */ +static inline void +anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); +} + +static inline void +anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc; + + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); +} + +static unsigned long count_vma_pages_range(struct mm_struct *mm, + unsigned long addr, unsigned long end) +{ + VMA_ITERATOR(vmi, mm, addr); + struct vm_area_struct *vma; + unsigned long nr_pages = 0; + + for_each_vma_range(vmi, vma, end) { + unsigned long vm_start = max(addr, vma->vm_start); + unsigned long vm_end = min(end, vma->vm_end); + + nr_pages += PHYS_PFN(vm_end - vm_start); + } + + return nr_pages; +} + +static void __vma_link_file(struct vm_area_struct *vma, + struct address_space *mapping) +{ + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(mapping); + + flush_dcache_mmap_lock(mapping); + vma_interval_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); +} + +/* + * vma_mas_store() - Store a VMA in the maple tree. + * @vma: The vm_area_struct + * @mas: The maple state + * + * Efficient way to store a VMA in the maple tree when the @mas has already + * walked to the correct location. + * + * Note: the end address is inclusive in the maple tree. + */ +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) +{ + trace_vma_store(mas->tree, vma); + mas_set_range(mas, vma->vm_start, vma->vm_end - 1); + mas_store_prealloc(mas, vma); +} + +/* + * vma_mas_remove() - Remove a VMA from the maple tree. + * @vma: The vm_area_struct + * @mas: The maple state + * + * Efficient way to remove a VMA from the maple tree when the @mas has already + * been established and points to the correct location. 
+ * Note: the end address is inclusive in the maple tree. + */ +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) +{ + trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1); + mas->index = vma->vm_start; + mas->last = vma->vm_end - 1; + mas_store_prealloc(mas, NULL); +} + +/* + * vma_mas_szero() - Set a given range to zero. Used when modifying a + * vm_area_struct start or end. + * + * @mas: The maple tree ma_state + * @start: The start address to zero + * @end: The end address to zero. + */ +static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, + unsigned long end) +{ + trace_vma_mas_szero(mas->tree, start, end - 1); + mas_set_range(mas, start, end - 1); + mas_store_prealloc(mas, NULL); +} + +static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) +{ + MA_STATE(mas, &mm->mm_mt, 0, 0); + struct address_space *mapping = NULL; + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + i_mmap_lock_write(mapping); + } + + vma_mas_store(vma, &mas); + + if (mapping) { + __vma_link_file(vma, mapping); + i_mmap_unlock_write(mapping); + } + + mm->map_count++; + validate_mm(mm); + return 0; +} + +/* + * vma_expand - Expand an existing VMA + * + * @mas: The maple state + * @vma: The vma to expand + * @start: The start of the vma + * @end: The exclusive end of the vma + * @pgoff: The page offset of vma + * @next: The current of next vma. + * + * Expand @vma to @start and @end. Can expand off the start and end. Will + * expand over @next if it's different from @vma and @end == @next->vm_end. + * Checking if the @vma can expand and merge with @next needs to be handled by + * the caller. 
+ * + * Returns: 0 on success + */ +inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next) +{ + struct mm_struct *mm = vma->vm_mm; + struct address_space *mapping = NULL; + struct rb_root_cached *root = NULL; + struct anon_vma *anon_vma = vma->anon_vma; + struct file *file = vma->vm_file; + bool remove_next = false; + struct vm_area_struct *anon_dup = NULL; + + if (next && (vma != next) && (end == next->vm_end)) { + remove_next = true; + if (next->anon_vma && !vma->anon_vma) { + int error; + + anon_vma = next->anon_vma; + vma->anon_vma = anon_vma; + error = anon_vma_clone(vma, next); + if (error) + return error; + + anon_dup = vma; + } + } + + /* Not merging but overwriting any part of next is not handled. */ + VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start); + /* Only handles expanding */ + VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); + + if (mas_preallocate(mas, vma, GFP_KERNEL)) + goto nomem; + + vma_adjust_trans_huge(vma, start, end, 0); + + if (file) { + mapping = file->f_mapping; + root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); + i_mmap_lock_write(mapping); + } + + if (anon_vma) { + anon_vma_lock_write(anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + + if (file) { + flush_dcache_mmap_lock(mapping); + vma_interval_tree_remove(vma, root); + } + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + /* Note: mas must be pointing to the expanding VMA */ + vma_mas_store(vma, mas); + + if (file) { + vma_interval_tree_insert(vma, root); + flush_dcache_mmap_unlock(mapping); + } + + /* Expanding over the next vma */ + if (remove_next && file) { + __remove_shared_vm_struct(next, file, mapping); + } + + if (anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(anon_vma); + } + + if (file) { + i_mmap_unlock_write(mapping); + uprobe_mmap(vma); + } + 
+ if (remove_next) { + if (file) { + uprobe_munmap(next, next->vm_start, next->vm_end); + fput(file); + } + if (next->anon_vma) + anon_vma_merge(vma, next); + mm->map_count--; + mpol_put(vma_policy(next)); + vm_area_free(next); + } + + validate_mm(mm); + return 0; + +nomem: + if (anon_dup) + unlink_anon_vmas(anon_dup); + + return -ENOMEM; +} + +/* + * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that + * is already present in an i_mmap tree without adjusting the tree. + * The following helper function should be used when such adjustments + * are necessary. The "insert" vma (if any) is to be inserted + * before we drop the necessary locks. + */ +int __vma_adjust(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert, + struct vm_area_struct *expand) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next_next = NULL; /* uninit var warning */ + struct vm_area_struct *next = find_vma(mm, vma->vm_end); + struct vm_area_struct *orig_vma = vma; + struct address_space *mapping = NULL; + struct rb_root_cached *root = NULL; + struct anon_vma *anon_vma = NULL; + struct file *file = vma->vm_file; + bool vma_changed = false; + long adjust_next = 0; + int remove_next = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vm_area_struct *exporter = NULL, *importer = NULL; + struct vm_area_struct *anon_dup = NULL; + + if (next && !insert) { + if (end >= next->vm_end) { + /* + * vma expands, overlapping all the next, and + * perhaps the one after too (mprotect case 6). + * The only other cases that gets here are + * case 1, case 7 and case 8. + */ + if (next == expand) { + /* + * The only case where we don't expand "vma" + * and we expand "next" instead is case 8. + */ + VM_WARN_ON(end != next->vm_end); + /* + * remove_next == 3 means we're + * removing "vma" and that to do so we + * swapped "vma" and "next". 
+ */ + remove_next = 3; + VM_WARN_ON(file != next->vm_file); + swap(vma, next); + } else { + VM_WARN_ON(expand != vma); + /* + * case 1, 6, 7, remove_next == 2 is case 6, + * remove_next == 1 is case 1 or 7. + */ + remove_next = 1 + (end > next->vm_end); + if (remove_next == 2) + next_next = find_vma(mm, next->vm_end); + + VM_WARN_ON(remove_next == 2 && + end != next_next->vm_end); + } + + exporter = next; + importer = vma; + + /* + * If next doesn't have anon_vma, import from vma after + * next, if the vma overlaps with it. + */ + if (remove_next == 2 && !next->anon_vma) + exporter = next_next; + + } else if (end > next->vm_start) { + /* + * vma expands, overlapping part of the next: + * mprotect case 5 shifting the boundary up. + */ + adjust_next = (end - next->vm_start); + exporter = next; + importer = vma; + VM_WARN_ON(expand != importer); + } else if (end < vma->vm_end) { + /* + * vma shrinks, and !insert tells it's not + * split_vma inserting another: so it must be + * mprotect case 4 shifting the boundary down. + */ + adjust_next = -(vma->vm_end - end); + exporter = vma; + importer = next; + VM_WARN_ON(expand != importer); + } + + /* + * Easily overlooked: when mprotect shifts the boundary, + * make sure the expanding vma has anon_vma set if the + * shrinking vma had, to cover any anon pages imported. 
+ */ + if (exporter && exporter->anon_vma && !importer->anon_vma) { + int error; + + importer->anon_vma = exporter->anon_vma; + error = anon_vma_clone(importer, exporter); + if (error) + return error; + + anon_dup = importer; + } + } + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + if (anon_dup) + unlink_anon_vmas(anon_dup); + + return -ENOMEM; + } + + vma_adjust_trans_huge(orig_vma, start, end, adjust_next); + if (file) { + mapping = file->f_mapping; + root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); + + if (adjust_next) + uprobe_munmap(next, next->vm_start, next->vm_end); + + i_mmap_lock_write(mapping); + if (insert && insert->vm_file) { + /* + * Put into interval tree now, so instantiated pages + * are visible to arm/parisc __flush_dcache_page + * throughout; but we cannot insert into address + * space until vma start or end is updated. + */ + __vma_link_file(insert, insert->vm_file->f_mapping); + } + } + + anon_vma = vma->anon_vma; + if (!anon_vma && adjust_next) + anon_vma = next->anon_vma; + if (anon_vma) { + VM_WARN_ON(adjust_next && next->anon_vma && + anon_vma != next->anon_vma); + anon_vma_lock_write(anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_pre_update_vma(next); + } + + if (file) { + flush_dcache_mmap_lock(mapping); + vma_interval_tree_remove(vma, root); + if (adjust_next) + vma_interval_tree_remove(next, root); + } + + if (start != vma->vm_start) { + if ((vma->vm_start < start) && + (!insert || (insert->vm_end != start))) { + vma_mas_szero(&mas, vma->vm_start, start); + VM_WARN_ON(insert && insert->vm_start > vma->vm_start); + } else { + vma_changed = true; + } + vma->vm_start = start; + } + if (end != vma->vm_end) { + if (vma->vm_end > end) { + if ((vma->vm_end + adjust_next != end) && + (!insert || (insert->vm_start != end))) { + vma_mas_szero(&mas, end, vma->vm_end); + mas_reset(&mas); + VM_WARN_ON(insert && + insert->vm_end < vma->vm_end); + } + } else { + 
vma_changed = true; + } + vma->vm_end = end; + } + + if (vma_changed) + vma_mas_store(vma, &mas); + + vma->vm_pgoff = pgoff; + if (adjust_next) { + next->vm_start += adjust_next; + next->vm_pgoff += adjust_next >> PAGE_SHIFT; + vma_mas_store(next, &mas); + } + + if (file) { + if (adjust_next) + vma_interval_tree_insert(next, root); + vma_interval_tree_insert(vma, root); + flush_dcache_mmap_unlock(mapping); + } + + if (remove_next && file) { + __remove_shared_vm_struct(next, file, mapping); + if (remove_next == 2) + __remove_shared_vm_struct(next_next, file, mapping); + } else if (insert) { + /* + * split_vma has split insert from vma, and needs + * us to insert it before dropping the locks + * (it may either follow vma or precede it). + */ + mas_reset(&mas); + vma_mas_store(insert, &mas); + mm->map_count++; + } + + if (anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + if (adjust_next) + anon_vma_interval_tree_post_update_vma(next); + anon_vma_unlock_write(anon_vma); + } + + if (file) { + i_mmap_unlock_write(mapping); + uprobe_mmap(vma); + + if (adjust_next) + uprobe_mmap(next); + } + + if (remove_next) { +again: + if (file) { + uprobe_munmap(next, next->vm_start, next->vm_end); + fput(file); + } + if (next->anon_vma) + anon_vma_merge(vma, next); + mm->map_count--; + mpol_put(vma_policy(next)); + if (remove_next != 2) + BUG_ON(vma->vm_end < next->vm_end); + vm_area_free(next); + + /* + * In mprotect's case 6 (see comments on vma_merge), + * we must remove next_next too. + */ + if (remove_next == 2) { + remove_next = 1; + next = next_next; + goto again; + } + } + if (insert && file) + uprobe_mmap(insert); + + mas_destroy(&mas); + validate_mm(mm); + + return 0; +} + +/* + * If the vma has a ->close operation then the driver probably needs to release + * per-vma resources, so we don't attempt to merge those. 
+ */ +static inline int is_mergeable_vma(struct vm_area_struct *vma, + struct file *file, unsigned long vm_flags, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) +{ + /* + * VM_SOFTDIRTY should not prevent from VMA merging, if we + * match the flags but dirty bit -- the caller should mark + * merged VMA as dirty. If dirty bit won't be excluded from + * comparison, we increase pressure on the memory system forcing + * the kernel to generate new VMAs when old one could be + * extended instead. + */ + if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) + return 0; + if (vma->vm_file != file) + return 0; + if (vma->vm_ops && vma->vm_ops->close) + return 0; + if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) + return 0; + if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) + return 0; + return 1; +} + +static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, + struct anon_vma *anon_vma2, + struct vm_area_struct *vma) +{ + /* + * The list_is_singular() test is to avoid merging VMA cloned from + * parents. This can improve scalability caused by anon_vma lock. + */ + if ((!anon_vma1 || !anon_vma2) && (!vma || + list_is_singular(&vma->anon_vma_chain))) + return 1; + return anon_vma1 == anon_vma2; +} + +/* + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * in front of (at a lower virtual address and file offset than) the vma. + * + * We cannot merge two vmas if they have differently assigned (non-NULL) + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. + * + * We don't check here for the merged mmap wrapping around the end of pagecache + * indices (16TB on ia32) because do_mmap() does not permit mmap's which + * wrap, nor mmaps which cover the final page at index -1UL. 
+ */ +static int +can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) +{ + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && + is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { + if (vma->vm_pgoff == vm_pgoff) + return 1; + } + return 0; +} + +/* + * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) + * beyond (at a higher virtual address and file offset than) the vma. + * + * We cannot merge two vmas if they have differently assigned (non-NULL) + * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. + */ +static int +can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) +{ + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && + is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { + pgoff_t vm_pglen; + vm_pglen = vma_pages(vma); + if (vma->vm_pgoff + vm_pglen == vm_pgoff) + return 1; + } + return 0; +} + +/* + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), + * figure out whether that can be merged with its predecessor or its + * successor. Or both (it neatly fills a hole). + * + * In most cases - when called for mmap, brk or mremap - [addr,end) is + * certain not to be mapped by the time vma_merge is called; but when + * called for mprotect, it is certain to be already mapped (either at + * an offset within prev, or at the start of next), and the flags of + * this area are about to be changed to vm_flags - and the no-change + * case has already been eliminated. 
+ *
+ * The following mprotect cases have to be considered, where AAAA is
+ * the area passed down from mprotect_fixup, never extending beyond one
+ * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after:
+ *
+ *     AAAA             AAAA                   AAAA
+ *    PPPPPPNNNNNN    PPPPPPNNNNNN       PPPPPPNNNNNN
+ *    cannot merge    might become       might become
+ *                    PPNNNNNNNNNN       PPPPPPPPPPNN
+ *    mmap, brk or    case 4 below       case 5 below
+ *    mremap move:
+ *                        AAAA               AAAA
+ *                    PPPP    NNNN       PPPPNNNNXXXX
+ *                    might become       might become
+ *                    PPPPPPPPPPPP 1 or  PPPPPPPPPPPP 6 or
+ *                    PPPPPPPPNNNN 2 or  PPPPPPPPXXXX 7 or
+ *                    PPPPNNNNNNNN 3     PPPPXXXXXXXX 8
+ *
+ * It is important for case 8 that the vma NNNN overlapping the
+ * region AAAA is never going to be extended over XXXX. Instead XXXX must
+ * be extended in region AAAA and NNNN must be removed. This way in
+ * all cases where vma_merge succeeds, the moment vma_adjust drops the
+ * rmap_locks, the properties of the merged vma will be already
+ * correct for the whole merged range. Some of those properties like
+ * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
+ * be correct for the whole merged range immediately after the
+ * rmap_locks are released. Otherwise if XXXX would be removed and
+ * NNNN would be extended over the XXXX range, remove_migration_ptes
+ * or other rmap walkers (if working on addresses beyond the "end"
+ * parameter) may establish ptes with the wrong permissions of NNNN
+ * instead of the right permissions of XXXX.
+ */ +struct vm_area_struct *vma_merge(struct mm_struct *mm, + struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, + struct anon_vma *anon_vma, struct file *file, + pgoff_t pgoff, struct mempolicy *policy, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) +{ + pgoff_t pglen = (end - addr) >> PAGE_SHIFT; + struct vm_area_struct *mid, *next, *res; + int err = -1; + bool merge_prev = false; + bool merge_next = false; + + /* + * We later require that vma->vm_flags == vm_flags, + * so this tests vma->vm_flags & VM_SPECIAL, too. + */ + if (vm_flags & VM_SPECIAL) + return NULL; + + next = find_vma(mm, prev ? prev->vm_end : 0); + mid = next; + if (next && next->vm_end == end) /* cases 6, 7, 8 */ + next = find_vma(mm, next->vm_end); + + /* verify some invariant that must be enforced by the caller */ + VM_WARN_ON(prev && addr <= prev->vm_start); + VM_WARN_ON(mid && end > mid->vm_end); + VM_WARN_ON(addr >= end); + + /* Can we merge the predecessor? */ + if (prev && prev->vm_end == addr && + mpol_equal(vma_policy(prev), policy) && + can_vma_merge_after(prev, vm_flags, + anon_vma, file, pgoff, + vm_userfaultfd_ctx, anon_name)) { + merge_prev = true; + } + /* Can we merge the successor? */ + if (next && end == next->vm_start && + mpol_equal(policy, vma_policy(next)) && + can_vma_merge_before(next, vm_flags, + anon_vma, file, pgoff+pglen, + vm_userfaultfd_ctx, anon_name)) { + merge_next = true; + } + /* Can we merge both the predecessor and the successor? 
*/ + if (merge_prev && merge_next && + is_mergeable_anon_vma(prev->anon_vma, + next->anon_vma, NULL)) { /* cases 1, 6 */ + err = __vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL, + prev); + res = prev; + } else if (merge_prev) { /* cases 2, 5, 7 */ + err = __vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL, prev); + res = prev; + } else if (merge_next) { + if (prev && addr < prev->vm_end) /* case 4 */ + err = __vma_adjust(prev, prev->vm_start, + addr, prev->vm_pgoff, NULL, next); + else /* cases 3, 8 */ + err = __vma_adjust(mid, addr, next->vm_end, + next->vm_pgoff - pglen, NULL, next); + res = next; + } + + /* + * Cannot merge with predecessor or successor or error in __vma_adjust? + */ + if (err) + return NULL; + khugepaged_enter_vma(res, vm_flags); + return res; +} + +/* + * Rough compatibility check to quickly see if it's even worth looking + * at sharing an anon_vma. + * + * They need to have the same vm_file, and the flags can only differ + * in things that mprotect may change. + * + * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that + * we can merge the two vma's. For example, we refuse to merge a vma if + * there is a vm_ops->close() function, because that indicates that the + * driver is doing some kind of reference counting. But that doesn't + * really matter for the anon_vma sharing case. + */ +static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) +{ + return a->vm_end == b->vm_start && + mpol_equal(vma_policy(a), vma_policy(b)) && + a->vm_file == b->vm_file && + !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && + b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); +} + +/* + * Do some basic sanity checking to see if we can re-use the anon_vma + * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be + * the same as 'old', the other will be the new one that is trying + * to share the anon_vma. + * + * NOTE! 
This runs with mmap_lock held for reading, so it is possible that + * the anon_vma of 'old' is concurrently in the process of being set up + * by another page fault trying to merge _that_. But that's ok: if it + * is being set up, that automatically means that it will be a singleton + * acceptable for merging, so we can do all of this optimistically. But + * we do that READ_ONCE() to make sure that we never re-load the pointer. + * + * IOW: that the "list_is_singular()" test on the anon_vma_chain only + * matters for the 'stable anon_vma' case (ie the thing we want to avoid + * is to return an anon_vma that is "complex" due to having gone through + * a fork). + * + * We also make sure that the two vma's are compatible (adjacent, + * and with the same memory policies). That's all stable, even with just + * a read lock on the mmap_lock. + */ +static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) +{ + if (anon_vma_compatible(a, b)) { + struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); + + if (anon_vma && list_is_singular(&old->anon_vma_chain)) + return anon_vma; + } + return NULL; +} + +/* + * find_mergeable_anon_vma is used by anon_vma_prepare, to check + * neighbouring vmas for a suitable anon_vma, before it goes off + * to allocate a new anon_vma. It checks because a repetitive + * sequence of mprotects and faults may otherwise lead to distinct + * anon_vmas being allocated, preventing vma merge in subsequent + * mprotect. + */ +struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) +{ + MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end); + struct anon_vma *anon_vma = NULL; + struct vm_area_struct *prev, *next; + + /* Try next first. */ + next = mas_walk(&mas); + if (next) { + anon_vma = reusable_anon_vma(next, vma, next); + if (anon_vma) + return anon_vma; + } + + prev = mas_prev(&mas, 0); + VM_BUG_ON_VMA(prev != vma, vma); + prev = mas_prev(&mas, 0); + /* Try prev next. 
*/ + if (prev) + anon_vma = reusable_anon_vma(prev, prev, vma); + + /* + * We might reach here with anon_vma == NULL if we can't find + * any reusable anon_vma. + * There's no absolute need to look only at touching neighbours: + * we could search further afield for "compatible" anon_vmas. + * But it would probably just be a waste of time searching, + * or lead to too many vmas hanging off the same anon_vma. + * We're trying to allow mprotect remerging later on, + * not trying to minimize memory used for anon_vmas. + */ + return anon_vma; +} + +/* + * If a hint addr is less than mmap_min_addr change hint to be as + * low as possible but still greater than mmap_min_addr + */ +static inline unsigned long round_hint_to_min(unsigned long hint) +{ + hint &= PAGE_MASK; + if (((void *)hint != NULL) && + (hint < mmap_min_addr)) + return PAGE_ALIGN(mmap_min_addr); + return hint; +} + +int mlock_future_check(struct mm_struct *mm, unsigned long flags, + unsigned long len) +{ + unsigned long locked, lock_limit; + + /* mlock MCL_FUTURE? */ + if (flags & VM_LOCKED) { + locked = len >> PAGE_SHIFT; + locked += mm->locked_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + return -EAGAIN; + } + return 0; +} + +static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + return MAX_LFS_FILESIZE; + + if (S_ISBLK(inode->i_mode)) + return MAX_LFS_FILESIZE; + + if (S_ISSOCK(inode->i_mode)) + return MAX_LFS_FILESIZE; + + /* Special "we do even unsigned file positions" case */ + if (file->f_mode & FMODE_UNSIGNED_OFFSET) + return 0; + + /* Yes, random drivers might want more. 
But I'm tired of buggy drivers */ + return ULONG_MAX; +} + +static inline bool file_mmap_ok(struct file *file, struct inode *inode, + unsigned long pgoff, unsigned long len) +{ + u64 maxsize = file_mmap_size_max(file, inode); + + if (maxsize && len > maxsize) + return false; + maxsize -= len; + if (pgoff > maxsize >> PAGE_SHIFT) + return false; + return true; +} + +/* + * The caller must write-lock current->mm->mmap_lock. + */ +unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flags, unsigned long pgoff, + unsigned long *populate, struct list_head *uf) +{ + struct mm_struct *mm = current->mm; + vm_flags_t vm_flags; + int pkey = 0; + + validate_mm(mm); + *populate = 0; + + if (!len) + return -EINVAL; + + /* + * Does the application expect PROT_READ to imply PROT_EXEC? + * + * (the exception is when the underlying filesystem is noexec + * mounted, in which case we dont add PROT_EXEC.) + */ + if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) + if (!(file && path_noexec(&file->f_path))) + prot |= PROT_EXEC; + + /* force arch specific MAP_FIXED handling in get_unmapped_area */ + if (flags & MAP_FIXED_NOREPLACE) + flags |= MAP_FIXED; + + if (!(flags & MAP_FIXED)) + addr = round_hint_to_min(addr); + + /* Careful about overflows.. */ + len = PAGE_ALIGN(len); + if (!len) + return -ENOMEM; + + /* offset overflow? */ + if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) + return -EOVERFLOW; + + /* Too many mappings? */ + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + /* Obtain the address to map to. we verify (or select) it and ensure + * that it represents a valid section of the address space. 
+ */ + addr = get_unmapped_area(file, addr, len, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; + + if (flags & MAP_FIXED_NOREPLACE) { + if (find_vma_intersection(mm, addr, addr + len)) + return -EEXIST; + } + + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(mm); + if (pkey < 0) + pkey = 0; + } + + /* Do simple checking here so the lower-level routines won't have + * to. we assume access permissions have been handled by the open + * of the memory object, so we don't do any here. + */ + vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + + if (flags & MAP_LOCKED) + if (!can_do_mlock()) + return -EPERM; + + if (mlock_future_check(mm, vm_flags, len)) + return -EAGAIN; + + if (file) { + struct inode *inode = file_inode(file); + unsigned long flags_mask; + + if (!file_mmap_ok(file, inode, pgoff, len)) + return -EOVERFLOW; + + flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags; + + switch (flags & MAP_TYPE) { + case MAP_SHARED: + /* + * Force use of MAP_SHARED_VALIDATE with non-legacy + * flags. E.g. MAP_SYNC is dangerous to use with + * MAP_SHARED as you don't know which consistency model + * you will get. We silently ignore unsupported flags + * with MAP_SHARED to preserve backward compatibility. + */ + flags &= LEGACY_MAP_MASK; + fallthrough; + case MAP_SHARED_VALIDATE: + if (flags & ~flags_mask) + return -EOPNOTSUPP; + if (prot & PROT_WRITE) { + if (!(file->f_mode & FMODE_WRITE)) + return -EACCES; + if (IS_SWAPFILE(file->f_mapping->host)) + return -ETXTBSY; + } + + /* + * Make sure we don't allow writing to an append-only + * file.. 
+ */ + if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) + return -EACCES; + + vm_flags |= VM_SHARED | VM_MAYSHARE; + if (!(file->f_mode & FMODE_WRITE)) + vm_flags &= ~(VM_MAYWRITE | VM_SHARED); + fallthrough; + case MAP_PRIVATE: + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + if (path_noexec(&file->f_path)) { + if (vm_flags & VM_EXEC) + return -EPERM; + vm_flags &= ~VM_MAYEXEC; + } + + if (!file->f_op->mmap) + return -ENODEV; + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + return -EINVAL; + break; + + default: + return -EINVAL; + } + } else { + switch (flags & MAP_TYPE) { + case MAP_SHARED: + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + return -EINVAL; + /* + * Ignore pgoff. + */ + pgoff = 0; + vm_flags |= VM_SHARED | VM_MAYSHARE; + break; + case MAP_PRIVATE: + /* + * Set pgoff according to addr for anon_vma. + */ + pgoff = addr >> PAGE_SHIFT; + break; + default: + return -EINVAL; + } + } + + /* + * Set 'VM_NORESERVE' if we should not account for the + * memory use of this mapping. 
+ */ + if (flags & MAP_NORESERVE) { + /* We honor MAP_NORESERVE if allowed to overcommit */ + if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + + /* hugetlb applies strict overcommit unless MAP_NORESERVE */ + if (file && is_file_hugepages(file)) + vm_flags |= VM_NORESERVE; + } + + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); + if (!IS_ERR_VALUE(addr) && + ((vm_flags & VM_LOCKED) || + (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) + *populate = len; + return addr; +} + +unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) +{ + struct file *file = NULL; + unsigned long retval; + + if (!(flags & MAP_ANONYMOUS)) { + audit_mmap_fd(fd, flags); + file = fget(fd); + if (!file) + return -EBADF; + if (is_file_hugepages(file)) { + len = ALIGN(len, huge_page_size(hstate_file(file))); + } else if (unlikely(flags & MAP_HUGETLB)) { + retval = -EINVAL; + goto out_fput; + } + } else if (flags & MAP_HUGETLB) { + struct hstate *hs; + + hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); + if (!hs) + return -EINVAL; + + len = ALIGN(len, huge_page_size(hs)); + /* + * VM_NORESERVE is used because the reservations will be + * taken when vm_ops->mmap() is called + */ + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, + VM_NORESERVE, + HUGETLB_ANONHUGE_INODE, + (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); + if (IS_ERR(file)) + return PTR_ERR(file); + } + + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); +out_fput: + if (file) + fput(file); + return retval; +} + +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); +} + +#ifdef __ARCH_WANT_SYS_OLD_MMAP +struct mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + 
unsigned long flags; + unsigned long fd; + unsigned long offset; +}; + +SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) +{ + struct mmap_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + if (offset_in_page(a.offset)) + return -EINVAL; + + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); +} +#endif /* __ARCH_WANT_SYS_OLD_MMAP */ + +/* + * Some shared mappings will want the pages marked read-only + * to track write events. If so, we'll downgrade vm_page_prot + * to the private version (using protection_map[] without the + * VM_SHARED bit). + */ +int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) +{ + vm_flags_t vm_flags = vma->vm_flags; + const struct vm_operations_struct *vm_ops = vma->vm_ops; + + /* If it was private or non-writable, the write bit is already clear */ + if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) + return 0; + + /* The backer wishes to know when pages are first written to? */ + if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite)) + return 1; + + /* The open routine did something to the protections that pgprot_modify + * won't preserve? */ + if (pgprot_val(vm_page_prot) != + pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) + return 0; + + /* + * Do we need to track softdirty? hugetlb does not support softdirty + * tracking yet. + */ + if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) + return 1; + + /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_wp(vma)) + return 1; + + /* Specialty mapping? */ + if (vm_flags & VM_PFNMAP) + return 0; + + /* Can the mapping track the dirty pages? */ + return vma->vm_file && vma->vm_file->f_mapping && + mapping_can_writeback(vma->vm_file->f_mapping); +} + +/* + * We account for memory if it's a private writeable mapping, + * not hugepages and VM_NORESERVE wasn't set. 
+ */ +static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) +{ + /* + * hugetlb has its own accounting separate from the core VM + * VM_HUGETLB may not be set yet so we cannot check for that flag. + */ + if (file && is_file_hugepages(file)) + return 0; + + /* Accountable only if VM_WRITE is set while VM_NORESERVE and VM_SHARED are clear */ + return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; +} + +/** + * unmapped_area() - Find an area between the low_limit and the high_limit with + * the correct alignment and offset, all from @info. Note: current->mm is used + * for the search. + * + * @info: The unmapped area information including the range (low_limit - + * high_limit), the alignment offset and mask. + * + * Return: A memory address, or -ENOMEM if the padded search length overflows + * or mas_empty_area() fails to find a range. + */ +static unsigned long unmapped_area(struct vm_unmapped_area_info *info) +{ + unsigned long length, gap, low_limit; + struct vm_area_struct *tmp; + + MA_STATE(mas, &current->mm->mm_mt, 0, 0); + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask; + if (length < info->length) + return -ENOMEM; + + low_limit = info->low_limit; +retry: + if (mas_empty_area(&mas, low_limit, info->high_limit - 1, length)) + return -ENOMEM; + + /* Move the start of the found range up until its align_mask bits match align_offset */ + gap = mas.index; + gap += (info->align_offset - gap) & info->align_mask; + tmp = mas_next(&mas, ULONG_MAX); + if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */ + /* Candidate extends past vm_start_gap() of the following VMA: search again above that VMA */ + if (vm_start_gap(tmp) < gap + length - 1) { + low_limit = tmp->vm_end; + mas_reset(&mas); + goto retry; + } + } else { + /* The VMA returned by mas_prev() has vm_end_gap() above the candidate start: search again from there */ + tmp = mas_prev(&mas, 0); + if (tmp && vm_end_gap(tmp) > gap) { + low_limit = vm_end_gap(tmp); + mas_reset(&mas); + goto retry; + } + } + + return gap; +} + +/** + * unmapped_area_topdown() - Find an area between the low_limit and the + * high_limit with the correct alignment and offset at the highest available + * address, all from @info. Note: current->mm is used for the search.
+ * + * @info: The unmapped area information including the range (low_limit - + * hight_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM. + */ +static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) +{ + unsigned long length, gap, high_limit, gap_end; + struct vm_area_struct *tmp; + + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask; + if (length < info->length) + return -ENOMEM; + + high_limit = info->high_limit; +retry: + if (mas_empty_area_rev(&mas, info->low_limit, high_limit - 1, + length)) + return -ENOMEM; + + gap = mas.last + 1 - info->length; + gap -= (gap - info->align_offset) & info->align_mask; + gap_end = mas.last; + tmp = mas_next(&mas, ULONG_MAX); + if (tmp && (tmp->vm_flags & VM_GROWSDOWN)) { /* Avoid prev check if possible */ + if (vm_start_gap(tmp) <= gap_end) { + high_limit = vm_start_gap(tmp); + mas_reset(&mas); + goto retry; + } + } else { + tmp = mas_prev(&mas, 0); + if (tmp && vm_end_gap(tmp) > gap) { + high_limit = tmp->vm_start; + mas_reset(&mas); + goto retry; + } + } + + return gap; +} + +/* + * Search for an unmapped address range. + * + * We are looking for a range that: + * - does not intersect with any VMA; + * - is contained within the [low_limit, high_limit) interval; + * - is at least the desired size. + * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) + */ +unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) +{ + unsigned long addr; + + if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) + addr = unmapped_area_topdown(info); + else + addr = unmapped_area(info); + + trace_vm_unmapped_area(addr, info); + return addr; +} + +/* Get an address range which is currently unmapped. + * For shmat() with addr=0. 
+ * + * Ugly calling convention alert: + * Return value with the low bits set means error value, + * i.e. + * if (ret & ~PAGE_MASK) + * error = ret; + * + * This function "knows" that -ENOMEM has the bits set. + * + * With MAP_FIXED, @addr is returned unchanged. Otherwise a non-zero @addr is + * treated as a hint: it is page-aligned and returned if the range fits + * below mmap_end, starts at or above mmap_min_addr, ends at or below + * vm_start_gap() of the following VMA and starts at or above vm_end_gap() + * of the preceding one. Failing that, vm_unmapped_area() searches + * [mm->mmap_base, mmap_end) without VM_UNMAPPED_AREA_TOPDOWN. + */ +unsigned long +generic_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + struct vm_unmapped_area_info info; + const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); + + if (len > mmap_end - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma_prev(mm, addr, &prev); + if (mmap_end - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) + return addr; + } + + info.flags = 0; + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = mmap_end; + info.align_mask = 0; + info.align_offset = 0; + return vm_unmapped_area(&info); +} + +#ifndef HAVE_ARCH_UNMAPPED_AREA +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + return generic_get_unmapped_area(filp, addr, len, pgoff, flags); +} +#endif /* !HAVE_ARCH_UNMAPPED_AREA */ + +/* + * This mmap-allocator allocates new areas top-down from below the + * stack's low limit (the base). + * + * MAP_FIXED and hinted addresses are handled as in + * generic_get_unmapped_area(). The top-down search covers + * [max(PAGE_SIZE, mmap_min_addr), arch_get_mmap_base(addr, mm->mmap_base)); + * if it returns an error value, a second search without + * VM_UNMAPPED_AREA_TOPDOWN is made over [TASK_UNMAPPED_BASE, mmap_end). + */ +unsigned long +generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct vm_area_struct *vma, *prev; + struct mm_struct *mm = current->mm; + struct vm_unmapped_area_info info; + const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); + + /* requested length too big for entire address space */ + if (len > mmap_end - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + /* requesting a specific address */ + if (addr)
{ + addr = PAGE_ALIGN(addr); + vma = find_vma_prev(mm, addr, &prev); + if (mmap_end - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) + return addr; + } + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); + info.align_mask = 0; + info.align_offset = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (offset_in_page(addr)) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = mmap_end; + addr = vm_unmapped_area(&info); + } + + return addr; +} + +#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); +} +#endif /* !HAVE_ARCH_UNMAPPED_AREA_TOPDOWN */ + /* + * get_unmapped_area() - choose and validate the address for a new mapping. + * + * Runs arch_mmap_check() and rejects len > TASK_SIZE, then calls one + * get_unmapped_area hook: file->f_op->get_unmapped_area when @file provides + * it, shmem_get_unmapped_area (with pgoff forced to 0) for a MAP_SHARED + * request without a file, and current->mm->get_unmapped_area otherwise. + * The hook's result is returned as-is if it is an error value; otherwise it + * must not exceed TASK_SIZE - len (-ENOMEM), must have no offset within a + * page (-EINVAL) and must pass security_mmap_addr(). + * + * Returns the chosen address, or an error value. + */ +unsigned long +get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + unsigned long (*get_area)(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long); + + unsigned long error = arch_mmap_check(addr, len, flags); + if (error) + return error; + + /* Careful about overflows.. */ + if (len > TASK_SIZE) + return -ENOMEM; + + get_area = current->mm->get_unmapped_area; + if (file) { + if (file->f_op->get_unmapped_area) + get_area = file->f_op->get_unmapped_area; + } else if (flags & MAP_SHARED) { + /* + * mmap_region() will call shmem_zero_setup() to create a file, + * so use shmem's get_unmapped_area in case it can be huge. + * do_mmap() will clear pgoff, so match alignment.
+ */ + pgoff = 0; + get_area = shmem_get_unmapped_area; + } + + addr = get_area(file, addr, len, pgoff, flags); + if (IS_ERR_VALUE(addr)) + return addr; + + if (addr > TASK_SIZE - len) + return -ENOMEM; + if (offset_in_page(addr)) + return -EINVAL; + + error = security_mmap_addr(addr); + return error ? error : addr; +} + +EXPORT_SYMBOL(get_unmapped_area); + +/** + * find_vma_intersection() - Look up the first VMA which intersects the interval + * @mm: The process address space. + * @start_addr: The inclusive start user address. + * @end_addr: The exclusive end user address. + * + * The lookup is an mt_find() over [@start_addr, @end_addr - 1] in mm->mm_mt; + * mmap_assert_locked() is called on @mm first. + * + * Returns: The first VMA within the provided range, %NULL otherwise. Assumes + * start_addr < end_addr. + */ +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + unsigned long index = start_addr; + + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} +EXPORT_SYMBOL(find_vma_intersection); + +/** + * find_vma() - Find the VMA for a given address, or the next VMA. + * @mm: The mm_struct to check + * @addr: The address + * + * The lookup is an mt_find() over [@addr, ULONG_MAX] in mm->mm_mt; + * mmap_assert_locked() is called on @mm first. + * + * Returns: The VMA associated with addr, or the next VMA. + * May return %NULL in the case of no VMA at addr or above. + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + unsigned long index = addr; + + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, ULONG_MAX); +} +EXPORT_SYMBOL(find_vma); + +/** + * find_vma_prev() - Find the VMA for a given address, or the next vma and + * set %pprev to the previous VMA, if any. + * @mm: The mm_struct to check + * @addr: The address + * @pprev: The pointer to set to the previous VMA + * + * Note that RCU lock is missing here since the external mmap_lock() is used + * instead. + * + * Returns: The VMA associated with @addr, or the next vma. + * May return %NULL in the case of no vma at addr or above.
+ */ +struct vm_area_struct * +find_vma_prev(struct mm_struct *mm, unsigned long addr, + struct vm_area_struct **pprev) +{ + struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, addr, addr); + + vma = mas_walk(&mas); + *pprev = mas_prev(&mas, 0); + if (!vma) + vma = mas_next(&mas, ULONG_MAX); + return vma; +} + +/* + * Verify that the stack growth is acceptable and + * update accounting. This is shared with both the + * grow-up and grow-down cases. + */ +static int acct_stack_growth(struct vm_area_struct *vma, + unsigned long size, unsigned long grow) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long new_start; + + /* address space limit tests */ + if (!may_expand_vm(mm, vma->vm_flags, grow)) + return -ENOMEM; + + /* Stack limit test */ + if (size > rlimit(RLIMIT_STACK)) + return -ENOMEM; + + /* mlock limit tests */ + if (mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT)) + return -ENOMEM; + + /* Check to ensure the stack will not grow into a hugetlb-only region */ + new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : + vma->vm_end - size; + if (is_hugepage_only_range(vma->vm_mm, new_start, size)) + return -EFAULT; + + /* + * Overcommit.. This must be the final test, as it will + * update security statistics. + */ + if (security_vm_enough_memory_mm(mm, grow)) + return -ENOMEM; + + return 0; +} + +#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) +/* + * PA-RISC uses this for its stack; IA64 for its Register Backing Store. + * vma is the last one with address > vma->vm_end. Have to extend vma. + */ +static int expand_upwards(struct vm_area_struct *vma, unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; + unsigned long gap_addr; + int error = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + + if (!(vma->vm_flags & VM_GROWSUP)) + return -EFAULT; + + /* Guard against exceeding limits of the address space. 
*/ + address &= PAGE_MASK; + if (address >= (TASK_SIZE & PAGE_MASK)) + return -ENOMEM; + address += PAGE_SIZE; + + /* Enforce stack_guard_gap */ + gap_addr = address + stack_guard_gap; + + /* Guard against overflow */ + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { + if (!(next->vm_flags & VM_GROWSUP)) + return -ENOMEM; + /* Check that both stack segments have the same anon_vma? */ + } + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) { + mas_destroy(&mas); + return -ENOMEM; + } + + /* + * vma->vm_start/vm_end cannot change under us because the caller + * is required to hold the mmap_lock in read mode. We need the + * anon_vma lock to serialize against concurrent expand_stacks. + */ + anon_vma_lock_write(vma->anon_vma); + + /* Somebody else might have raced and expanded it already */ + if (address > vma->vm_end) { + unsigned long size, grow; + + size = address - vma->vm_start; + grow = (address - vma->vm_end) >> PAGE_SHIFT; + + error = -ENOMEM; + if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + /* + * We only hold a shared mmap_lock lock here, so + * we need to protect against concurrent vma + * expansions. anon_vma_lock_write() doesn't + * help here, as we don't guarantee that all + * growable vmas in a mm share the same root + * anon vma. So, we reuse mm->page_table_lock + * to guard against concurrent vma expansions. + */ + spin_lock(&mm->page_table_lock); + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, grow); + anon_vma_interval_tree_pre_update_vma(vma); + vma->vm_end = address; + /* Overwrite old entry in mtree. 
*/ + vma_mas_store(vma, &mas); + anon_vma_interval_tree_post_update_vma(vma); + spin_unlock(&mm->page_table_lock); + + perf_event_mmap(vma); + } + } + } + anon_vma_unlock_write(vma->anon_vma); + khugepaged_enter_vma(vma, vma->vm_flags); + mas_destroy(&mas); + return error; +} +#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ + +/* + * vma is the first one with address < vma->vm_start. Have to extend vma. + * mmap_lock held for writing. + */ +int expand_downwards(struct vm_area_struct *vma, unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); + struct vm_area_struct *prev; + int error = 0; + + if (!(vma->vm_flags & VM_GROWSDOWN)) + return -EFAULT; + + address &= PAGE_MASK; + if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) + return -EPERM; + + /* Enforce stack_guard_gap */ + prev = mas_prev(&mas, 0); + /* Check that both stack segments have the same anon_vma? */ + if (prev) { + if (!(prev->vm_flags & VM_GROWSDOWN) && + vma_is_accessible(prev) && + (address - prev->vm_end < stack_guard_gap)) + return -ENOMEM; + } + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) { + mas_destroy(&mas); + return -ENOMEM; + } + + /* + * vma->vm_start/vm_end cannot change under us because the caller + * is required to hold the mmap_lock in read mode. We need the + * anon_vma lock to serialize against concurrent expand_stacks. + */ + anon_vma_lock_write(vma->anon_vma); + + /* Somebody else might have raced and expanded it already */ + if (address < vma->vm_start) { + unsigned long size, grow; + + size = vma->vm_end - address; + grow = (vma->vm_start - address) >> PAGE_SHIFT; + + error = -ENOMEM; + if (grow <= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + /* + * We only hold a shared mmap_lock lock here, so + * we need to protect against concurrent vma + * expansions. 
anon_vma_lock_write() doesn't + * help here, as we don't guarantee that all + * growable vmas in a mm share the same root + * anon vma. So, we reuse mm->page_table_lock + * to guard against concurrent vma expansions. + */ + spin_lock(&mm->page_table_lock); + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, grow); + anon_vma_interval_tree_pre_update_vma(vma); + vma->vm_start = address; + vma->vm_pgoff -= grow; + /* Overwrite old entry in mtree. */ + vma_mas_store(vma, &mas); + anon_vma_interval_tree_post_update_vma(vma); + spin_unlock(&mm->page_table_lock); + + perf_event_mmap(vma); + } + } + } + anon_vma_unlock_write(vma->anon_vma); + khugepaged_enter_vma(vma, vma->vm_flags); + mas_destroy(&mas); + return error; +} + +/* enforced gap between the expanding stack and other mappings. */ +unsigned long stack_guard_gap = 256UL<comm, current->pid); + + if (prot) + return ret; + start = start & PAGE_MASK; + size = size & PAGE_MASK; + + if (start + size <= start) + return ret; + + /* Does pgoff wrap? */ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return ret; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + vma = vma_lookup(mm, start); + + if (!vma || !(vma->vm_flags & VM_SHARED)) + goto out; + + if (start + size > vma->vm_end) { + VMA_ITERATOR(vmi, mm, vma->vm_end); + struct vm_area_struct *next, *prev = vma; + + for_each_vma_range(vmi, next, start + size) { + /* hole between vmas ? */ + if (next->vm_start != prev->vm_end) + goto out; + + if (next->vm_file != vma->vm_file) + goto out; + + if (next->vm_flags != vma->vm_flags) + goto out; + + if (start + size <= next->vm_end) + break; + + prev = next; + } + + if (!next) + goto out; + } + + prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; + prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; + prot |= vma->vm_flags & VM_EXEC ? 
PROT_EXEC : 0; + + flags &= MAP_NONBLOCK; + flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; + if (vma->vm_flags & VM_LOCKED) + flags |= MAP_LOCKED; + + file = get_file(vma->vm_file); + ret = do_mmap(vma->vm_file, start, size, + prot, flags, pgoff, &populate, NULL); + fput(file); +out: + mmap_write_unlock(mm); + if (populate) + mm_populate(ret, populate); + if (!IS_ERR_VALUE(ret)) + ret = 0; + return ret; +} + +/* + * brk_munmap() - Unmap a parital vma. + * @mas: The maple tree state. + * @vma: The vma to be modified + * @newbrk: the start of the address to unmap + * @oldbrk: The end of the address to unmap + * @uf: The userfaultfd list_head + * + * Returns: 1 on success. + * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if + * possible. + */ +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf) +{ + struct mm_struct *mm = vma->vm_mm; + int ret; + + arch_unmap(mm, newbrk, oldbrk); + ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, true); + validate_mm_mt(mm); + return ret; +} + +/* + * do_brk_flags() - Increase the brk vma if the flags match. + * @mas: The maple tree state. + * @addr: The start address + * @len: The length of the increase + * @vma: The vma, + * @flags: The VMA Flags + * + * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags + * do not match then create a new anonymous VMA. Eventually we may be able to + * do some brk-specific accounting here. + */ +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long addr, unsigned long len, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + + validate_mm_mt(mm); + /* + * Check against address space limits by the changed size + * Note: This happens *after* clearing old mappings in some code paths. 
+ */ + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) + return -ENOMEM; + + if (mm->map_count > sysctl_max_map_count) + return -ENOMEM; + + if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) + return -ENOMEM; + + /* + * Expand the existing vma if possible; Note that singular lists do not + * occur after forking, so the expand will only happen on new VMAs. + */ + if (vma && vma->vm_end == addr && !vma_policy(vma) && + can_vma_merge_after(vma, flags, NULL, NULL, + addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { + mas_set_range(mas, vma->vm_start, addr + len - 1); + if (mas_preallocate(mas, vma, GFP_KERNEL)) + return -ENOMEM; + + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); + if (vma->anon_vma) { + anon_vma_lock_write(vma->anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + vma->vm_end = addr + len; + vma->vm_flags |= VM_SOFTDIRTY; + mas_store_prealloc(mas, vma); + + if (vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } + khugepaged_enter_vma(vma, flags); + goto out; + } + + /* create a vma struct for an anonymous mapping */ + vma = vm_area_alloc(mm); + if (!vma) + goto vma_alloc_fail; + + vma_set_anonymous(vma); + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_pgoff = addr >> PAGE_SHIFT; + vma->vm_flags = flags; + vma->vm_page_prot = vm_get_page_prot(flags); + mas_set_range(mas, vma->vm_start, addr + len - 1); + if (mas_store_gfp(mas, vma, GFP_KERNEL)) + goto mas_store_fail; + + mm->map_count++; +out: + perf_event_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; + mm->data_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) + mm->locked_vm += (len >> PAGE_SHIFT); + vma->vm_flags |= VM_SOFTDIRTY; + validate_mm(mm); + return 0; + +mas_store_fail: + vm_area_free(vma); +vma_alloc_fail: + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; +} + +int vm_brk_flags(unsigned long addr, unsigned long request, 
unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + unsigned long len; + int ret; + bool populate; + LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, addr, addr); + + len = PAGE_ALIGN(request); + if (len < request) + return -ENOMEM; + if (!len) + return 0; + + /* Until we need other flags, refuse anything except VM_EXEC. */ + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + ret = check_brk_limits(addr, len); + if (ret) + goto limits_failed; + + ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0); + if (ret) + goto munmap_failed; + + vma = mas_prev(&mas, 0); + ret = do_brk_flags(&mas, vma, addr, len, flags); + populate = ((mm->def_flags & VM_LOCKED) != 0); + mmap_write_unlock(mm); + userfaultfd_unmap_complete(mm, &uf); + if (populate && !ret) + mm_populate(addr, len); + return ret; + +munmap_failed: +limits_failed: + mmap_write_unlock(mm); + return ret; +} +EXPORT_SYMBOL(vm_brk_flags); + +int vm_brk(unsigned long addr, unsigned long len) +{ + return vm_brk_flags(addr, len, 0); +} +EXPORT_SYMBOL(vm_brk); + +/* Release all mmaps. */ +void exit_mmap(struct mm_struct *mm) +{ + struct mmu_gather tlb; + struct vm_area_struct *vma; + unsigned long nr_accounted = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + int count = 0; + + /* mm's last user has gone, and its about to be pulled down */ + mmu_notifier_release(mm); + + mmap_read_lock(mm); + arch_exit_mmap(mm); + + vma = mas_find(&mas, ULONG_MAX); + if (!vma) { + /* Can happen if dup_mmap() received an OOM */ + mmap_read_unlock(mm); + return; + } + + lru_add_drain(); + flush_cache_mm(mm); + tlb_gather_mmu_fullmm(&tlb, mm); + /* update_hiwater_rss(mm) here? 
but nobody should be looking */ + /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ + unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + mmap_read_unlock(mm); + + /* + * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper + * because the memory has been already freed. + */ + set_bit(MMF_OOM_SKIP, &mm->flags); + mmap_write_lock(mm); + mt_clear_in_rcu(&mm->mm_mt); + free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, + USER_PGTABLES_CEILING); + tlb_finish_mmu(&tlb); + + /* + * Walk the list again, actually closing and freeing it, with preemption + * enabled, without holding any MM locks besides the unreachable + * mmap_write_lock. + */ + do { + if (vma->vm_flags & VM_ACCOUNT) + nr_accounted += vma_pages(vma); + remove_vma(vma); + count++; + cond_resched(); + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + + BUG_ON(count != mm->map_count); + + trace_exit_mmap(mm); + __mt_destroy(&mm->mm_mt); + mmap_write_unlock(mm); + vm_unacct_memory(nr_accounted); +} + +/* Insert vm structure into process list sorted by address + * and into the inode's i_mmap tree. If vm_file is non-NULL + * then i_mmap_rwsem is taken here. + */ +int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) +{ + unsigned long charged = vma_pages(vma); + + + if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) + return -ENOMEM; + + if ((vma->vm_flags & VM_ACCOUNT) && + security_vm_enough_memory_mm(mm, charged)) + return -ENOMEM; + + /* + * The vm_pgoff of a purely anonymous vma should be irrelevant + * until its first write fault, when page's anon_vma and index + * are set. But now set the vm_pgoff it will almost certainly + * end up with (unless mremap moves it elsewhere before that + * first wfault), so /proc/pid/maps tells a consistent story. + * + * By setting it to reflect the virtual start address of the + * vma, merges and splits can happen in a seamless way, just + * using the existing file pgoff checks and manipulations. 
+ * Similarly in do_mmap and in do_brk_flags. + */ + if (vma_is_anonymous(vma)) { + BUG_ON(vma->anon_vma); + vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; + } + + if (vma_link(mm, vma)) { + vm_unacct_memory(charged); + return -ENOMEM; + } + + return 0; +} + +/* + * Copy the vma structure to a new location in the same mm, + * prior to moving page table entries, to effect an mremap move. + */ +struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + unsigned long addr, unsigned long len, pgoff_t pgoff, + bool *need_rmap_locks) +{ + struct vm_area_struct *vma = *vmap; + unsigned long vma_start = vma->vm_start; + struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *new_vma, *prev; + bool faulted_in_anon_vma = true; + + validate_mm_mt(mm); + /* + * If anonymous vma has not yet been faulted, update new pgoff + * to match new location, to increase its chance of merging. + */ + if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { + pgoff = addr >> PAGE_SHIFT; + faulted_in_anon_vma = false; + } + + new_vma = find_vma_prev(mm, addr, &prev); + if (new_vma && new_vma->vm_start < addr + len) + return NULL; /* should never get here */ + + new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (new_vma) { + /* + * Source vma may have been merged into new_vma + */ + if (unlikely(vma_start >= new_vma->vm_start && + vma_start < new_vma->vm_end)) { + /* + * The only way we can get a vma_merge with + * self during an mremap is if the vma hasn't + * been faulted in yet and we were allowed to + * reset the dst vma->vm_pgoff to the + * destination address of the mremap to allow + * the merge to happen. mremap must change the + * vm_pgoff linearity between src and dst vmas + * (in turn preventing a vma_merge) to be + * safe. It is only safe to keep the vm_pgoff + * linear if there are no pages mapped yet. 
+ */ + VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); + *vmap = vma = new_vma; + } + *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); + } else { + new_vma = vm_area_dup(vma); + if (!new_vma) + goto out; + new_vma->vm_start = addr; + new_vma->vm_end = addr + len; + new_vma->vm_pgoff = pgoff; + if (vma_dup_policy(vma, new_vma)) + goto out_free_vma; + if (anon_vma_clone(new_vma, vma)) + goto out_free_mempol; + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + if (vma_link(mm, new_vma)) + goto out_vma_link; + *need_rmap_locks = false; + } + validate_mm_mt(mm); + return new_vma; + +out_vma_link: + if (new_vma->vm_ops && new_vma->vm_ops->close) + new_vma->vm_ops->close(new_vma); + + if (new_vma->vm_file) + fput(new_vma->vm_file); + + unlink_anon_vmas(new_vma); +out_free_mempol: + mpol_put(vma_policy(new_vma)); +out_free_vma: + vm_area_free(new_vma); +out: + validate_mm_mt(mm); + return NULL; +} + +/* + * Return true if the calling process may expand its vm space by the passed + * number of pages + */ +bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) +{ + if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) + return false; + + if (is_data_mapping(flags) && + mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { + /* Workaround for Valgrind */ + if (rlimit(RLIMIT_DATA) == 0 && + mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT) + return true; + + pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n", + current->comm, current->pid, + (mm->data_vm + npages) << PAGE_SHIFT, + rlimit(RLIMIT_DATA), + ignore_rlimit_data ? 
"" : " or use boot option ignore_rlimit_data"); + + if (!ignore_rlimit_data) + return false; + } + + return true; +} + +void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) +{ + WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); + + if (is_exec_mapping(flags)) + mm->exec_vm += npages; + else if (is_stack_mapping(flags)) + mm->stack_vm += npages; + else if (is_data_mapping(flags)) + mm->data_vm += npages; +} + +static vm_fault_t special_mapping_fault(struct vm_fault *vmf); + +/* + * Having a close hook prevents vma merging regardless of flags. + */ +static void special_mapping_close(struct vm_area_struct *vma) +{ +} + +static const char *special_mapping_name(struct vm_area_struct *vma) +{ + return ((struct vm_special_mapping *)vma->vm_private_data)->name; +} + +static int special_mapping_mremap(struct vm_area_struct *new_vma) +{ + struct vm_special_mapping *sm = new_vma->vm_private_data; + + if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) + return -EFAULT; + + if (sm->mremap) + return sm->mremap(sm, new_vma); + + return 0; +} + +static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Forbid splitting special mappings - kernel has expectations over + * the number of pages in mapping. Together with VM_DONTEXPAND + * the size of vma should stay the same over the special mapping's + * lifetime. 
+ */ + return -EINVAL; +} + +static const struct vm_operations_struct special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, + .mremap = special_mapping_mremap, + .name = special_mapping_name, + /* vDSO code relies that VVAR can't be accessed remotely */ + .access = NULL, + .may_split = special_mapping_split, +}; + +static const struct vm_operations_struct legacy_special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, +}; + +static vm_fault_t special_mapping_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + pgoff_t pgoff; + struct page **pages; + + if (vma->vm_ops == &legacy_special_mapping_vmops) { + pages = vma->vm_private_data; + } else { + struct vm_special_mapping *sm = vma->vm_private_data; + + if (sm->fault) + return sm->fault(sm, vmf->vma, vmf); + + pages = sm->pages; + } + + for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) + pgoff--; + + if (*pages) { + struct page *page = *pages; + get_page(page); + vmf->page = page; + return 0; + } + + return VM_FAULT_SIGBUS; +} + +static struct vm_area_struct *__install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, void *priv, + const struct vm_operations_struct *ops) +{ + int ret; + struct vm_area_struct *vma; + + validate_mm_mt(mm); + vma = vm_area_alloc(mm); + if (unlikely(vma == NULL)) + return ERR_PTR(-ENOMEM); + + vma->vm_start = addr; + vma->vm_end = addr + len; + + vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + + vma->vm_ops = ops; + vma->vm_private_data = priv; + + ret = insert_vm_struct(mm, vma); + if (ret) + goto out; + + vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); + + perf_event_mmap(vma); + + validate_mm_mt(mm); + return vma; + +out: + vm_area_free(vma); + validate_mm_mt(mm); + return ERR_PTR(ret); +} + +bool 
vma_is_special_mapping(const struct vm_area_struct *vma, + const struct vm_special_mapping *sm) +{ + return vma->vm_private_data == sm && + (vma->vm_ops == &special_mapping_vmops || + vma->vm_ops == &legacy_special_mapping_vmops); +} + +/* + * Called with mm->mmap_lock held for writing. + * Insert a new vma covering the given region, with the given flags. + * Its pages are supplied by the given array of struct page *. + * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. + * The region past the last page supplied will always produce SIGBUS. + * The array pointer and the pages it points to are assumed to stay alive + * for as long as this mapping might exist. + */ +struct vm_area_struct *_install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, const struct vm_special_mapping *spec) +{ + return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, + &special_mapping_vmops); +} + +/* + * Legacy variant of _install_special_mapping(): the page array itself is + * stored as the vma's private data and legacy_special_mapping_vmops is used. + * Returns 0 on success or a negative errno. + */ +int install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, struct page **pages) +{ + struct vm_area_struct *vma = __install_special_mapping( + mm, addr, len, vm_flags, (void *)pages, + &legacy_special_mapping_vmops); + + return PTR_ERR_OR_ZERO(vma); +} + +static DEFINE_MUTEX(mm_all_locks_mutex); + +static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) +{ + if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { + /* + * The LSB of rb_root.rb_node can't change from under us + * because we hold the mm_all_locks_mutex. + */ + down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); + /* + * We can safely modify rb_root.rb_node after taking the + * anon_vma->root->rwsem. 
If some other vma in this mm shares + * the same anon_vma we won't take it again. + * + * No need of atomic instructions here, rb_root.rb_node + * can't change from under us thanks to the + * anon_vma->root->rwsem. 
+ */ + if (__test_and_set_bit(0, (unsigned long *) + &anon_vma->root->rb_root.rb_root.rb_node)) + BUG(); + } +} + +static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) +{ + if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change from under us because + * we hold the mm_all_locks_mutex. + * + * Operations on ->flags have to be atomic because + * even if AS_MM_ALL_LOCKS is stable thanks to the + * mm_all_locks_mutex, there may be other cpus + * changing other bitflags in parallel to us. + */ + if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) + BUG(); + down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); + } +} + +/* + * This operation locks against the VM for all pte/vma/mm related + * operations that could ever happen on a certain mm. This includes + * vmtruncate, try_to_unmap, and all page faults. + * + * The caller must take the mmap_lock in write mode before calling + * mm_take_all_locks(). The caller isn't allowed to release the + * mmap_lock until mm_drop_all_locks() returns. + * + * mmap_lock in write mode is required in order to block all operations + * that could modify pagetables and free pages without need of + * altering the vma layout. It's also needed in write mode to avoid new + * anon_vmas to be associated with existing vmas. + * + * A single task can't take more than one mm_take_all_locks() in a row + * or it would deadlock. + * + * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in + * mapping->flags avoid to take the same lock twice, if more than one + * vma in this mm is backed by the same anon_vma or address_space. 
+ * + * We take locks in the following order, according to the comment at the + * beginning of mm/rmap.c: + * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for + * hugetlb mapping); + * - all i_mmap_rwsem locks; + * - all anon_vma->rwsem locks. + * + * We can take all locks within these types randomly because the VM code + * doesn't nest them and we are protected from parallel mm_take_all_locks() + * by mm_all_locks_mutex. + * + * mm_take_all_locks() and mm_drop_all_locks() are expensive operations + * that may have to take thousands of locks. + * + * mm_take_all_locks() can fail if it's interrupted by signals; in that case + * the locks taken so far are dropped via mm_drop_all_locks() and -EINTR is + * returned. + */ +int mm_take_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct anon_vma_chain *avc; + MA_STATE(mas, &mm->mm_mt, 0, 0); + + mmap_assert_write_locked(mm); + + mutex_lock(&mm_all_locks_mutex); + + mas_for_each(&mas, vma, ULONG_MAX) { + if (signal_pending(current)) + goto out_unlock; + if (vma->vm_file && vma->vm_file->f_mapping && + is_vm_hugetlb_page(vma)) + vm_lock_mapping(mm, vma->vm_file->f_mapping); + } + + mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { + if (signal_pending(current)) + goto out_unlock; + if (vma->vm_file && vma->vm_file->f_mapping && + !is_vm_hugetlb_page(vma)) + vm_lock_mapping(mm, vma->vm_file->f_mapping); + } + + mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { + if (signal_pending(current)) + goto out_unlock; + if (vma->anon_vma) + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_lock_anon_vma(mm, avc->anon_vma); + } + + return 0; + +out_unlock: + mm_drop_all_locks(mm); + return -EINTR; +} + +static void vm_unlock_anon_vma(struct anon_vma *anon_vma) +{ + if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { + /* + * The LSB of rb_root.rb_node can't change to 0 from under + * us because we hold the mm_all_locks_mutex. 
+ * + * We must however clear the bitflag before unlocking + * the vma so the users using the anon_vma->rb_root will + * never see our bitflag. 
+ * + * No need of atomic instructions here, rb_root.rb_node + * can't change from under us until we release the + * anon_vma->root->rwsem. + */ + if (!__test_and_clear_bit(0, (unsigned long *) + &anon_vma->root->rb_root.rb_root.rb_node)) + BUG(); + anon_vma_unlock_write(anon_vma); + } +} + +static void vm_unlock_mapping(struct address_space *mapping) +{ + if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change to 0 from under us + * because we hold the mm_all_locks_mutex. + */ + i_mmap_unlock_write(mapping); + if (!test_and_clear_bit(AS_MM_ALL_LOCKS, + &mapping->flags)) + BUG(); + } +} + +/* + * The mmap_lock cannot be released by the caller until + * mm_drop_all_locks() returns. + */ +void mm_drop_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + struct anon_vma_chain *avc; + MA_STATE(mas, &mm->mm_mt, 0, 0); + + mmap_assert_write_locked(mm); + BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); + + mas_for_each(&mas, vma, ULONG_MAX) { + if (vma->anon_vma) + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_unlock_anon_vma(avc->anon_vma); + if (vma->vm_file && vma->vm_file->f_mapping) + vm_unlock_mapping(vma->vm_file->f_mapping); + } + + mutex_unlock(&mm_all_locks_mutex); +} + +/* + * initialise the percpu counter for VM + */ +void __init mmap_init(void) +{ + int ret; + + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); + VM_BUG_ON(ret); +} + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. 
+ */ +static int init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +subsys_initcall(init_user_reserve); + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +subsys_initcall(init_admin_reserve); + +/* + * Reinitialise user and admin reserves if memory is added or removed. + * + * The default user reserve max is 128MB, and the default max for the + * admin reserve is 8MB. These are usually, but not always, enough to + * enable recovery from a memory hogging process using login/sshd, a shell, + * and tools like top. It may make sense to increase or even disable the + * reserve depending on the existence of swap or variations in the recovery + * tools. So, the admin may have changed them. + * + * If memory is added and the reserves have been eliminated or increased above + * the default max, then we'll trust the admin. + * + * If memory is removed and there isn't enough free memory, then we + * need to reset the reserves. + * + * Otherwise keep the reserve set by the admin. + */ +static int reserve_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + unsigned long tmp, free_kbytes; + + switch (action) { + case MEM_ONLINE: + /* Default max is 128MB. Leave alone if modified by operator. 
*/ + tmp = sysctl_user_reserve_kbytes; + if (0 < tmp && tmp < (1UL << 17)) + init_user_reserve(); + + /* Default max is 8MB. Leave alone if modified by operator. */ + tmp = sysctl_admin_reserve_kbytes; + if (0 < tmp && tmp < (1UL << 13)) + init_admin_reserve(); + + break; + case MEM_OFFLINE: + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + if (sysctl_user_reserve_kbytes > free_kbytes) { + init_user_reserve(); + pr_info("vm.user_reserve_kbytes reset to %lu\n", + sysctl_user_reserve_kbytes); + } + + if (sysctl_admin_reserve_kbytes > free_kbytes) { + init_admin_reserve(); + pr_info("vm.admin_reserve_kbytes reset to %lu\n", + sysctl_admin_reserve_kbytes); + } + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block reserve_mem_nb = { + .notifier_call = reserve_mem_notifier, +}; + +static int __meminit init_reserve_notifier(void) +{ + if (register_hotmemory_notifier(&reserve_mem_nb)) + pr_err("Failed registering memory add/remove notifier for admin reserve\n"); + + return 0; +} +subsys_initcall(init_reserve_notifier); diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c new file mode 100644 index 000000000..1854850b4 --- /dev/null +++ b/mm/mmap_lock.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 +#define CREATE_TRACE_POINTS +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); + +#ifdef CONFIG_MEMCG + +/* + * Our various events all share the same buffer (because we don't want or need + * to allocate a set of buffers *per event type*), so we need to protect against + * concurrent _reg() and _unreg() calls, and count how many _reg() calls have + * been made. + */ +static DEFINE_MUTEX(reg_lock); +static int reg_refcount; /* Protected by reg_lock. */ + +/* + * Size of the buffer for memcg path names. 
Ignoring stack trace support, + * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. + */ +#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL + +/* + * How many contexts our trace events might be called in: normal, softirq, irq, + * and NMI. + */ +#define CONTEXT_COUNT 4 + +struct memcg_path { + local_lock_t lock; + char __rcu *buf; + local_t buf_idx; +}; +static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = { + .lock = INIT_LOCAL_LOCK(lock), + .buf_idx = LOCAL_INIT(0), +}; + +static char **tmp_bufs; + +/* Called with reg_lock held. */ +static void free_memcg_path_bufs(void) +{ + struct memcg_path *memcg_path; + int cpu; + char **old = tmp_bufs; + + for_each_possible_cpu(cpu) { + memcg_path = per_cpu_ptr(&memcg_paths, cpu); + *(old++) = rcu_dereference_protected(memcg_path->buf, + lockdep_is_held(&reg_lock)); + rcu_assign_pointer(memcg_path->buf, NULL); + } + + /* Wait for inflight memcg_path_buf users to finish. */ + synchronize_rcu(); + + old = tmp_bufs; + for_each_possible_cpu(cpu) { + kfree(*(old++)); + } + + kfree(tmp_bufs); + tmp_bufs = NULL; +} + +/* + * Take a registration reference; the first caller allocates one path buffer + * per possible CPU. Returns 0 on success or -ENOMEM if an allocation fails. + */ +int trace_mmap_lock_reg(void) +{ + int cpu; + char *new; + + mutex_lock(&reg_lock); + + /* If the refcount is going 0->1, proceed with allocating buffers. */ + if (reg_refcount++) + goto out; + + tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs), + GFP_KERNEL); + if (tmp_bufs == NULL) + goto out_fail; + + for_each_possible_cpu(cpu) { + new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL); + if (new == NULL) + goto out_fail_free; + rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new); + /* Don't need to wait for inflights, they'd have gotten NULL. */ + } + +out: + mutex_unlock(&reg_lock); + return 0; + +out_fail_free: + free_memcg_path_bufs(); +out_fail: + /* Since we failed, undo the earlier ref increment. 
*/ + --reg_refcount; + + mutex_unlock(&reg_lock); + return -ENOMEM; +} + +/* Drop a registration reference; the last caller frees the buffers. */ +void trace_mmap_lock_unreg(void) +{ + mutex_lock(&reg_lock); + + /* If the refcount is going 1->0, proceed with freeing buffers. */ + if (--reg_refcount) + goto out; + + free_memcg_path_bufs(); + +out: + mutex_unlock(&reg_lock); +} + +/* + * Return this CPU's next MEMCG_PATH_BUF_SIZE slot, or NULL if no buffer is + * installed. On success the RCU read lock is left held; it is dropped by + * put_memcg_path_buf(). + */ +static inline char *get_memcg_path_buf(void) +{ + struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths); + char *buf; + int idx; + + rcu_read_lock(); + buf = rcu_dereference(memcg_path->buf); + if (buf == NULL) { + rcu_read_unlock(); + return NULL; + } + idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) - + MEMCG_PATH_BUF_SIZE; + return &buf[idx]; +} + +static inline void put_memcg_path_buf(void) +{ + local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx); + rcu_read_unlock(); +} + +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + do { \ + const char *memcg_path; \ + local_lock(&memcg_paths.lock); \ + memcg_path = get_mm_memcg_path(mm); \ + trace_mmap_lock_##type(mm, \ + memcg_path != NULL ? memcg_path : "", \ + ##__VA_ARGS__); \ + if (likely(memcg_path != NULL)) \ + put_memcg_path_buf(); \ + local_unlock(&memcg_paths.lock); \ + } while (0) + +#else /* !CONFIG_MEMCG */ + +int trace_mmap_lock_reg(void) +{ + return 0; +} + +void trace_mmap_lock_unreg(void) +{ +} + +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) + +#endif /* CONFIG_MEMCG */ + +#ifdef CONFIG_TRACING +#ifdef CONFIG_MEMCG +/* + * Write the given mm_struct's memcg path to a percpu buffer, and return a + * pointer to it. If the path cannot be determined, or no buffer was available + * (because the trace event is being unregistered), NULL is returned. 
+ * + * Note: buffers are allocated per-cpu to avoid locking, so preemption must be + * disabled by the caller before calling us, and re-enabled only after the + * caller is done with the pointer. 
+ * + * The caller must call put_memcg_path_buf() once the buffer is no longer + * needed. This must be done while preemption is still disabled. + */ +static const char *get_mm_memcg_path(struct mm_struct *mm) +{ + char *buf = NULL; + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + + if (memcg == NULL) + goto out; + if (unlikely(memcg->css.cgroup == NULL)) + goto out_put; + + buf = get_memcg_path_buf(); + if (buf == NULL) + goto out_put; + + cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE); + +out_put: + css_put(&memcg->css); +out: + return buf; +} + +#endif /* CONFIG_MEMCG */ + +/* + * Trace calls must be in a separate file, as otherwise there's a circular + * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. + */ + +void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) +{ + TRACE_MMAP_LOCK_EVENT(start_locking, mm, write); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); + +void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, + bool success) +{ + TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); + +void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) +{ + TRACE_MMAP_LOCK_EVENT(released, mm, write); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_released); +#endif /* CONFIG_TRACING */ diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c new file mode 100644 index 000000000..3a2c3f8ca --- /dev/null +++ b/mm/mmu_gather.c @@ -0,0 +1,367 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifndef CONFIG_MMU_GATHER_NO_GATHER + +static bool tlb_next_batch(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; + + batch = tlb->active; + if (batch->next) { + tlb->active = batch->next; + return true; + } + + if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) + return false; + + batch = (void *)__get_free_pages(GFP_NOWAIT | 
__GFP_NOWARN, 0); + if (!batch) + return false; + + tlb->batch_count++; + batch->next = NULL; + batch->nr = 0; + batch->max = MAX_GATHER_BATCH; + + tlb->active->next = batch; + tlb->active = batch; + + return true; +} + +static void tlb_batch_pages_flush(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch; + + for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { + struct page **pages = batch->pages; + + do { + /* + * limit free batch count when PAGE_SIZE > 4K + */ + unsigned int nr = min(512U, batch->nr); + + free_pages_and_swap_cache(pages, nr); + pages += nr; + batch->nr -= nr; + + cond_resched(); + } while (batch->nr); + } + tlb->active = &tlb->local; +} + +static void tlb_batch_list_free(struct mmu_gather *tlb) +{ + struct mmu_gather_batch *batch, *next; + + for (batch = tlb->local.next; batch; batch = next) { + next = batch->next; + free_pages((unsigned long)batch, 0); + } + tlb->local.next = NULL; +} + +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) +{ + struct mmu_gather_batch *batch; + + VM_BUG_ON(!tlb->end); + +#ifdef CONFIG_MMU_GATHER_PAGE_SIZE + VM_WARN_ON(tlb->page_size != page_size); +#endif + + batch = tlb->active; + /* + * Add the page and check if we are full. If so + * force a flush. + */ + batch->pages[batch->nr++] = page; + if (batch->nr == batch->max) { + if (!tlb_next_batch(tlb)) + return true; + batch = tlb->active; + } + VM_BUG_ON_PAGE(batch->nr > batch->max, page); + + return false; +} + +#endif /* MMU_GATHER_NO_GATHER */ + +#ifdef CONFIG_MMU_GATHER_TABLE_FREE + +static void __tlb_remove_table_free(struct mmu_table_batch *batch) +{ + int i; + + for (i = 0; i < batch->nr; i++) + __tlb_remove_table(batch->tables[i]); + + free_page((unsigned long)batch); +} + +#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE + +/* + * Semi RCU freeing of the page directories. + * + * This is needed by some architectures to implement software pagetable walkers. 
+ * + * gup_fast() and other software pagetable walkers do a lockless page-table + * walk and therefore needs some synchronization with the freeing of the page + * directories. The chosen means to accomplish that is by disabling IRQs over + * the walk. + * + * Architectures that use IPIs to flush TLBs will then automagically DTRT, + * since we unlink the page, flush TLBs, free the page. Since the disabling of + * IRQs delays the completion of the TLB flush we can never observe an already + * freed page. + * + * Architectures that do not have this (PPC) need to delay the freeing by some + * other means, this is that means. + * + * What we do is batch the freed directory pages (tables) and RCU free them. + * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling + * holds off grace periods. + * + * However, in order to batch these pages we need to allocate storage, this + * allocation is deep inside the MM code and can thus easily fail on memory + * pressure. To guarantee progress we fall back to single table freeing, see + * the implementation of tlb_remove_table_one(). + * + */ + +static void tlb_remove_table_smp_sync(void *arg) +{ + /* Simply deliver the interrupt */ +} + +void tlb_remove_table_sync_one(void) +{ + /* + * This isn't an RCU grace period and hence the page-tables cannot be + * assumed to be actually RCU-freed. + * + * It is however sufficient for software page-table walkers that rely on + * IRQ disabling. 
+ */ + smp_call_function(tlb_remove_table_smp_sync, NULL, 1); +} + +static void tlb_remove_table_rcu(struct rcu_head *head) +{ + __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu)); +} + +static void tlb_remove_table_free(struct mmu_table_batch *batch) +{ + call_rcu(&batch->rcu, tlb_remove_table_rcu); +} + +#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ + +static void tlb_remove_table_free(struct mmu_table_batch *batch) +{ + __tlb_remove_table_free(batch); +} + +#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ + +/* + * If we want tlb_remove_table() to imply TLB invalidates. + */ +static inline void tlb_table_invalidate(struct mmu_gather *tlb) +{ + if (tlb_needs_table_invalidate()) { + /* + * Invalidate page-table caches used by hardware walkers. Then + * we still need to RCU-sched wait while freeing the pages + * because software walkers can still be in-flight. + */ + tlb_flush_mmu_tlbonly(tlb); + } +} + +static void tlb_remove_table_one(void *table) +{ + tlb_remove_table_sync_one(); + __tlb_remove_table(table); +} + +static void tlb_table_flush(struct mmu_gather *tlb) +{ + struct mmu_table_batch **batch = &tlb->batch; + + if (*batch) { + tlb_table_invalidate(tlb); + tlb_remove_table_free(*batch); + *batch = NULL; + } +} + +void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + struct mmu_table_batch **batch = &tlb->batch; + + if (*batch == NULL) { + *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); + if (*batch == NULL) { + tlb_table_invalidate(tlb); + tlb_remove_table_one(table); + return; + } + (*batch)->nr = 0; + } + + (*batch)->tables[(*batch)->nr++] = table; + if ((*batch)->nr == MAX_TABLE_BATCH) + tlb_table_flush(tlb); +} + +static inline void tlb_table_init(struct mmu_gather *tlb) +{ + tlb->batch = NULL; +} + +#else /* !CONFIG_MMU_GATHER_TABLE_FREE */ + +static inline void tlb_table_flush(struct mmu_gather *tlb) { } +static inline void tlb_table_init(struct mmu_gather *tlb) { } + +#endif /* 
CONFIG_MMU_GATHER_TABLE_FREE */ + +/* + * Release what was gathered: the table batch first, then (unless + * CONFIG_MMU_GATHER_NO_GATHER is set) the batched pages. + */ +static void tlb_flush_mmu_free(struct mmu_gather *tlb) +{ + tlb_table_flush(tlb); +#ifndef CONFIG_MMU_GATHER_NO_GATHER + tlb_batch_pages_flush(tlb); +#endif +} + +/* Flush the TLB first, then free the gathered tables and pages */ +void tlb_flush_mmu(struct mmu_gather *tlb) +{ + tlb_flush_mmu_tlbonly(tlb); + tlb_flush_mmu_free(tlb); +} + +/* + * Common initialisation behind tlb_gather_mmu() and tlb_gather_mmu_fullmm(). + * Ends with inc_tlb_flush_pending(), which is paired with the + * dec_tlb_flush_pending() call in tlb_finish_mmu(). + */ +static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + bool fullmm) +{ + /* + * struct mmu_gather contains 7 1-bit fields packed into a 32-bit + * unsigned int value. The remaining 25 bits remain uninitialized + * and are never used, but KMSAN updates the origin for them in + * zap_pXX_range() in mm/memory.c, thus creating very long origin + * chains. This is technically correct, but consumes too much memory. + * Unpoisoning the whole structure will prevent creating such chains. + */ + kmsan_unpoison_memory(tlb, sizeof(*tlb)); + tlb->mm = mm; + tlb->fullmm = fullmm; + +#ifndef CONFIG_MMU_GATHER_NO_GATHER + tlb->need_flush_all = 0; + tlb->local.next = NULL; + tlb->local.nr = 0; + tlb->local.max = ARRAY_SIZE(tlb->__pages); + tlb->active = &tlb->local; + tlb->batch_count = 0; +#endif + + tlb_table_init(tlb); +#ifdef CONFIG_MMU_GATHER_PAGE_SIZE + tlb->page_size = 0; +#endif + + __tlb_reset_range(tlb); + inc_tlb_flush_pending(tlb->mm); +} + +/** + * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down + * @tlb: the mmu_gather structure to initialize + * @mm: the mm_struct of the target address space + * + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. + * + * Same initialisation as tlb_gather_mmu_fullmm(), except that the gather is + * not marked fullmm. + */ +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) +{ + __tlb_gather_mmu(tlb, mm, false); +} + +/** + * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down + * @tlb: the mmu_gather structure to initialize + * @mm: the mm_struct of the target address space + * + * In this case, @mm is without users and we're going to destroy the + * full address space (exit/execve).
+ * + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. + */ +void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) +{ + __tlb_gather_mmu(tlb, mm, true); +} + +/** + * tlb_finish_mmu - finish an mmu_gather structure + * @tlb: the mmu_gather structure to finish + * + * Called at the end of the shootdown operation to free up any resources that + * were required. + */ +void tlb_finish_mmu(struct mmu_gather *tlb) +{ + /* + * If there are parallel threads doing PTE changes on the same range + * under a non-exclusive lock (e.g., mmap_lock read-side) while + * deferring the TLB flush by batching, one thread may end up seeing + * inconsistent PTEs and leave stale TLB entries behind. So flush the + * TLB forcefully if we detect parallel PTE batching threads. + * + * However, some syscalls, e.g. munmap(), may free page tables, which + * requires force-flushing everything in the given range. Otherwise + * this may result in stale TLB entries on architectures, e.g. + * aarch64, that can specify which TLB level to flush. + */ + if (mm_tlb_flush_nested(tlb->mm)) { + /* + * The aarch64 yields better performance with fullmm by + * avoiding multiple CPUs spamming TLBI messages at the + * same time. + * + * On x86 non-fullmm doesn't yield significant difference + * against fullmm. + */ + tlb->fullmm = 1; + __tlb_reset_range(tlb); + tlb->freed_tables = 1; + } + + tlb_flush_mmu(tlb); + +#ifndef CONFIG_MMU_GATHER_NO_GATHER + tlb_batch_list_free(tlb); +#endif + dec_tlb_flush_pending(tlb->mm); +} diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c new file mode 100644 index 000000000..f45ff1b76 --- /dev/null +++ b/mm/mmu_notifier.c @@ -0,0 +1,1132 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/mm/mmu_notifier.c + * + * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright (C) 2008 SGI + * Christoph Lameter + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* global SRCU for all MMs */ +DEFINE_STATIC_SRCU(srcu); + +#ifdef CONFIG_LOCKDEP +struct lockdep_map __mmu_notifier_invalidate_range_start_map = { + .name = "mmu_notifier_invalidate_range_start" +}; +#endif + +/* + * The mmu_notifier_subscriptions structure is allocated and installed in + * mm->notifier_subscriptions inside the mm_take_all_locks() protected + * critical section and it's released only when mm_count reaches zero + * in mmdrop(). + */ +struct mmu_notifier_subscriptions { + /* all mmu notifiers registered in this mm are queued in this list */ + struct hlist_head list; + /* set when a notifier is registered in itree mode (NULL subscription) */ + bool has_itree; + /* to serialize the list modifications and hlist_unhashed */ + spinlock_t lock; + /* odd while an invalidation that overlaps the itree is in progress */ + unsigned long invalidate_seq; + /* number of invalidate_range_start/end regions currently open */ + unsigned long active_invalidate_ranges; + struct rb_root_cached itree; + /* woken by mn_itree_inv_end() once invalidate_seq is even again */ + wait_queue_head_t wq; + /* interval notifiers whose itree insert/remove waits for mn_itree_inv_end() */ + struct hlist_head deferred_list; +}; + +/* + * This is a collision-retry read-side/write-side 'lock', a lot like a + * seqcount, however this allows multiple write-sides to hold it at + * once. Conceptually the write side is protecting the values of the PTEs in + * this mm, such that PTEs cannot be read into SPTEs (shadow PTEs) while any + * writer exists. + * + * Note that the core mm creates nested invalidate_range_start()/end() regions + * within the same thread, and runs invalidate_range_start()/end() in parallel + * on multiple CPUs. This is designed to not reduce concurrency or block + * progress on the mm side. + * + * As a secondary function, holding the full write side also serves to prevent + * writers for the itree, this is an optimization to avoid extra locking + * during invalidate_range_start/end notifiers.
+ * + * The write side has two states, fully excluded: + * - subscriptions->active_invalidate_ranges != 0 + * - subscriptions->invalidate_seq & 1 == True (odd) + * - some range on the mm_struct is being invalidated + * - the itree is not allowed to change + * + * And partially excluded: + * - subscriptions->active_invalidate_ranges != 0 + * - subscriptions->invalidate_seq & 1 == False (even) + * - some range on the mm_struct is being invalidated + * - the itree is allowed to change + * + * Operations on notifier_subscriptions->invalidate_seq (under spinlock): + * seq |= 1 # Begin writing + * seq++ # Release the writing state + * seq & 1 # True if a writer exists + * + * The latter state avoids some expensive work on inv_end in the common case of + * no mmu_interval_notifier monitoring the VA. + */ +static bool +mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions) +{ + lockdep_assert_held(&subscriptions->lock); + return subscriptions->invalidate_seq & 1; +} + +/* + * Open an invalidation for @range: bump active_invalidate_ranges and, if any + * interval notifier overlaps [range->start, range->end - 1], make + * invalidate_seq odd. Returns the first overlapping notifier (or NULL) and + * stores the resulting sequence in *seq. + */ +static struct mmu_interval_notifier * +mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions, + const struct mmu_notifier_range *range, + unsigned long *seq) +{ + struct interval_tree_node *node; + struct mmu_interval_notifier *res = NULL; + + spin_lock(&subscriptions->lock); + subscriptions->active_invalidate_ranges++; + node = interval_tree_iter_first(&subscriptions->itree, range->start, + range->end - 1); + if (node) { + subscriptions->invalidate_seq |= 1; + res = container_of(node, struct mmu_interval_notifier, + interval_tree); + } + + *seq = subscriptions->invalidate_seq; + spin_unlock(&subscriptions->lock); + return res; +} + +/* Return the next interval notifier overlapping @range after @interval_sub, or NULL */ +static struct mmu_interval_notifier * +mn_itree_inv_next(struct mmu_interval_notifier *interval_sub, + const struct mmu_notifier_range *range) +{ + struct interval_tree_node *node; + + node = interval_tree_iter_next(&interval_sub->interval_tree, + range->start, range->end - 1); + if (!node) + return NULL; + return container_of(node, struct mmu_interval_notifier,
interval_tree); +} + +/* + * Close one invalidation. Only when the last active range ends while + * invalidate_seq is odd does this make the sequence even, apply the deferred + * itree inserts/removals and wake the waiters on subscriptions->wq. + */ +static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions) +{ + struct mmu_interval_notifier *interval_sub; + struct hlist_node *next; + + spin_lock(&subscriptions->lock); + if (--subscriptions->active_invalidate_ranges || + !mn_itree_is_invalidating(subscriptions)) { + spin_unlock(&subscriptions->lock); + return; + } + + /* Make invalidate_seq even */ + subscriptions->invalidate_seq++; + + /* + * The inv_end incorporates a deferred mechanism like rtnl_unlock(). + * Adds and removes are queued until the final inv_end happens then + * they are progressed. This arrangement for tree updates is used to + * avoid using a blocking lock during invalidate_range_start. + */ + hlist_for_each_entry_safe(interval_sub, next, + &subscriptions->deferred_list, + deferred_item) { + if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb)) + interval_tree_insert(&interval_sub->interval_tree, + &subscriptions->itree); + else + interval_tree_remove(&interval_sub->interval_tree, + &subscriptions->itree); + hlist_del(&interval_sub->deferred_item); + } + spin_unlock(&subscriptions->lock); + + wake_up_all(&subscriptions->wq); +} + +/** + * mmu_interval_read_begin - Begin a read side critical section against a VA + * range + * @interval_sub: The interval subscription + * + * mmu_interval_read_begin()/mmu_interval_read_retry() implement a + * collision-retry scheme similar to seqcount for the VA range under + * subscription. If the mm invokes invalidation during the critical section + * then mmu_interval_read_retry() will return true. + * + * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs + * require a blocking context. The critical region formed by this can sleep, + * and the required 'user_lock' can also be a sleeping lock. + * + * The caller is required to provide a 'user_lock' to serialize both teardown + * and setup. + * + * The return value should be passed to mmu_interval_read_retry().
+ */ +unsigned long +mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub) +{ + struct mmu_notifier_subscriptions *subscriptions = + interval_sub->mm->notifier_subscriptions; + unsigned long seq; + bool is_invalidating; + + /* + * If the subscription has a different seq value under the user_lock + * than we started with then it has collided. + * + * If the subscription currently has the same seq value as the + * subscriptions seq, then it is currently between + * invalidate_start/end and is colliding. + * + * The locking looks broadly like this: + * mn_tree_invalidate_start(): mmu_interval_read_begin(): + * spin_lock + * seq = READ_ONCE(interval_sub->invalidate_seq); + * seq == subs->invalidate_seq + * spin_unlock + * spin_lock + * seq = ++subscriptions->invalidate_seq + * spin_unlock + * op->invalidate_range(): + * user_lock + * mmu_interval_set_seq() + * interval_sub->invalidate_seq = seq + * user_unlock + * + * [Required: mmu_interval_read_retry() == true] + * + * mn_itree_inv_end(): + * spin_lock + * seq = ++subscriptions->invalidate_seq + * spin_unlock + * + * user_lock + * mmu_interval_read_retry(): + * interval_sub->invalidate_seq != seq + * user_unlock + * + * Barriers are not needed here as any races here are closed by an + * eventual mmu_interval_read_retry(), which provides a barrier via the + * user_lock. + */ + spin_lock(&subscriptions->lock); + /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ + seq = READ_ONCE(interval_sub->invalidate_seq); + is_invalidating = seq == subscriptions->invalidate_seq; + spin_unlock(&subscriptions->lock); + + /* + * interval_sub->invalidate_seq must always be set to an odd value via + * mmu_interval_set_seq() using the provided cur_seq from + * mn_itree_inv_start_range(). This ensures that if seq does wrap we + * will always clear the below sleep in some reasonable time as + * subscriptions->invalidate_seq is even in the idle state.
+ */ + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); + if (is_invalidating) + wait_event(subscriptions->wq, + READ_ONCE(subscriptions->invalidate_seq) != seq); + + /* + * Notice that mmu_interval_read_retry() can already be true at this + * point, avoiding loops here allows the caller to provide a global + * time bound. + */ + + return seq; +} +EXPORT_SYMBOL_GPL(mmu_interval_read_begin); + +/* + * Invalidate every interval notifier with a blockable MMU_NOTIFY_RELEASE + * range spanning 0..ULONG_MAX. A false return from ->invalidate() is not + * expected here and only triggers a warning. + */ +static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions, + struct mm_struct *mm) +{ + struct mmu_notifier_range range = { + .flags = MMU_NOTIFIER_RANGE_BLOCKABLE, + .event = MMU_NOTIFY_RELEASE, + .mm = mm, + .start = 0, + .end = ULONG_MAX, + }; + struct mmu_interval_notifier *interval_sub; + unsigned long cur_seq; + bool ret; + + for (interval_sub = + mn_itree_inv_start_range(subscriptions, &range, &cur_seq); + interval_sub; + interval_sub = mn_itree_inv_next(interval_sub, &range)) { + ret = interval_sub->ops->invalidate(interval_sub, &range, + cur_seq); + WARN_ON(!ret); + } + + mn_itree_inv_end(subscriptions); +} + +/* + * This function can't run concurrently against mmu_notifier_register + * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap + * runs with mm_users == 0. Other tasks may still invoke mmu notifiers + * in parallel despite there being no task using this mm any more, + * through the vmas outside of the exit_mmap context, such as with + * vmtruncate. This serializes against mmu_notifier_unregister with + * the notifier_subscriptions->lock in addition to SRCU and it serializes + * against the other mmu notifiers with SRCU. struct mmu_notifier_subscriptions + * can't go away from under us as exit_mmap holds an mm_count pin + * itself.
+ */ +static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions, + struct mm_struct *mm) +{ + struct mmu_notifier *subscription; + int id; + + /* + * SRCU here will block mmu_notifier_unregister until + * ->release returns. + */ + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) + /* + * If ->release runs before mmu_notifier_unregister it must be + * handled, as it's the only way for the driver to flush all + * existing sptes and stop the driver from establishing any more + * sptes before all the pages in the mm are freed. + */ + if (subscription->ops->release) + subscription->ops->release(subscription, mm); + + spin_lock(&subscriptions->lock); + while (unlikely(!hlist_empty(&subscriptions->list))) { + subscription = hlist_entry(subscriptions->list.first, + struct mmu_notifier, hlist); + /* + * We arrived before mmu_notifier_unregister so + * mmu_notifier_unregister will do nothing other than to wait + * for ->release to finish and for mmu_notifier_unregister to + * return. + */ + hlist_del_init_rcu(&subscription->hlist); + } + spin_unlock(&subscriptions->lock); + srcu_read_unlock(&srcu, id); + + /* + * synchronize_srcu here prevents mmu_notifier_release from returning to + * exit_mmap (which would proceed with freeing all pages in the mm) + * until the ->release method returns, if it was invoked by + * mmu_notifier_unregister. + * + * The notifier_subscriptions can't go away from under us because + * one mm_count is held by exit_mmap.
+ */ + synchronize_srcu(&srcu); +} + +/* + * Run the release path for both kinds of notifiers on @mm: interval + * notifiers first (when has_itree is set), then the hlist notifiers (when the + * list is not empty). + */ +void __mmu_notifier_release(struct mm_struct *mm) +{ + struct mmu_notifier_subscriptions *subscriptions = + mm->notifier_subscriptions; + + if (subscriptions->has_itree) + mn_itree_release(subscriptions, mm); + + if (!hlist_empty(&subscriptions->list)) + mn_hlist_release(subscriptions, mm); +} + +/* + * If no young bitflag is supported by the hardware, ->clear_flush_young can + * unmap the address and return 1 or 0 depending if the mapping previously + * existed or not. + * + * The results of all registered ->clear_flush_young() callbacks are ORed + * together and returned. + */ +int __mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct mmu_notifier *subscription; + int young = 0, id; + + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { + if (subscription->ops->clear_flush_young) + young |= subscription->ops->clear_flush_young( + subscription, mm, start, end); + } + srcu_read_unlock(&srcu, id); + + return young; +} + +/* + * OR together the results of every registered ->clear_young() callback for + * the given start/end, walking the notifier list under SRCU. + */ +int __mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct mmu_notifier *subscription; + int young = 0, id; + + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { + if (subscription->ops->clear_young) + young |= subscription->ops->clear_young(subscription, + mm, start, end); + } + srcu_read_unlock(&srcu, id); + + return young; +} + +/* + * Return the first non-zero ->test_young() result for @address, or 0 when no + * registered notifier reports one; the walk stops at the first non-zero + * result. + */ +int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *subscription; + int young = 0, id; + + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { + if (subscription->ops->test_young) { + young = subscription->ops->test_young(subscription, mm, + address); + if (young) + break; + } + } + srcu_read_unlock(&srcu, id); + + return young; +} + +/* Pass @pte for @address to every notifier that implements ->change_pte() */ +void
__mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, + pte_t pte) +{ + struct mmu_notifier *subscription; + int id; + + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { + if (subscription->ops->change_pte) + subscription->ops->change_pte(subscription, mm, address, + pte); + } + srcu_read_unlock(&srcu, id); +} + +/* + * Call ->invalidate() on every interval notifier overlapping @range. + * Returns 0, or -EAGAIN when a callback returns false for a non-blockable + * range; in that case the invalidation is closed here through + * mn_itree_inv_end(). A false return for a blockable range only triggers a + * warning and the walk continues. + */ +static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, + const struct mmu_notifier_range *range) +{ + struct mmu_interval_notifier *interval_sub; + unsigned long cur_seq; + + for (interval_sub = + mn_itree_inv_start_range(subscriptions, range, &cur_seq); + interval_sub; + interval_sub = mn_itree_inv_next(interval_sub, range)) { + bool ret; + + ret = interval_sub->ops->invalidate(interval_sub, range, + cur_seq); + if (!ret) { + if (WARN_ON(mmu_notifier_range_blockable(range))) + continue; + goto out_would_block; + } + } + return 0; + +out_would_block: + /* + * On -EAGAIN the non-blocking caller is not allowed to call + * invalidate_range_end() + */ + mn_itree_inv_end(subscriptions); + return -EAGAIN; +} + +static int mn_hlist_invalidate_range_start( + struct mmu_notifier_subscriptions *subscriptions, + struct mmu_notifier_range *range) +{ + struct mmu_notifier *subscription; + int ret = 0; + int id; + + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { + const struct mmu_notifier_ops *ops = subscription->ops; + + if (ops->invalidate_range_start) { + int _ret; + + if (!mmu_notifier_range_blockable(range)) + non_block_start(); + _ret = ops->invalidate_range_start(subscription, range); + if (!mmu_notifier_range_blockable(range)) + non_block_end(); + if (_ret) { + pr_info("%pS callback failed with %d in %sblockable context.\n", + ops->invalidate_range_start, _ret, + !mmu_notifier_range_blockable(range) ?
+ "non-" : + ""); + WARN_ON(mmu_notifier_range_blockable(range) || + _ret != -EAGAIN); + /* + * We call all the notifiers on any EAGAIN, + * there is no way for a notifier to know if + * its start method failed, thus a start that + * does EAGAIN can't also do end. + */ + WARN_ON(ops->invalidate_range_end); + ret = _ret; + } + } + } + + if (ret) { + /* + * Must be non-blocking to get here. If there are multiple + * notifiers and one or more failed start, any that succeeded + * start are expecting their end to be called. Do so now. + */ + hlist_for_each_entry_rcu(subscription, &subscriptions->list, + hlist, srcu_read_lock_held(&srcu)) { + if (!subscription->ops->invalidate_range_end) + continue; + + subscription->ops->invalidate_range_end(subscription, + range); + } + } + srcu_read_unlock(&srcu, id); + + return ret; +} + +/* + * Start an invalidation of @range. Interval notifiers are handled first and + * an error from them is returned without calling the hlist notifiers. + */ +int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) +{ + struct mmu_notifier_subscriptions *subscriptions = + range->mm->notifier_subscriptions; + int ret; + + if (subscriptions->has_itree) { + ret = mn_itree_invalidate(subscriptions, range); + if (ret) + return ret; + } + if (!hlist_empty(&subscriptions->list)) + return mn_hlist_invalidate_range_start(subscriptions, range); + return 0; +} + +static void +mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions, + struct mmu_notifier_range *range, bool only_end) +{ + struct mmu_notifier *subscription; + int id; + + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { + /* + * Call invalidate_range here too to avoid the need for the + * subsystem of having to register an invalidate_range_end + * call-back when there is invalidate_range already. Usually a + * subsystem registers either invalidate_range_start()/end() or + * invalidate_range(), so this will be no additional overhead + * (besides the pointer check).
+ * + * We skip the call to invalidate_range() if we know it is safe, + * i.e. the call site uses mmu_notifier_invalidate_range_only_end(), + * which is safe to do when we know that a call to + * invalidate_range() already happened under the page table lock. + */ + if (!only_end && subscription->ops->invalidate_range) + subscription->ops->invalidate_range(subscription, + range->mm, + range->start, + range->end); + if (subscription->ops->invalidate_range_end) { + if (!mmu_notifier_range_blockable(range)) + non_block_start(); + subscription->ops->invalidate_range_end(subscription, + range); + if (!mmu_notifier_range_blockable(range)) + non_block_end(); + } + } + srcu_read_unlock(&srcu, id); +} + +/* + * End an invalidation of @range: close the interval-tree side first (when + * has_itree is set), then call the hlist notifiers. Both steps run between + * lock_map_acquire() and lock_map_release() on the invalidate_range_start map. + */ +void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, + bool only_end) +{ + struct mmu_notifier_subscriptions *subscriptions = + range->mm->notifier_subscriptions; + + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + if (subscriptions->has_itree) + mn_itree_inv_end(subscriptions); + + if (!hlist_empty(&subscriptions->list)) + mn_hlist_invalidate_end(subscriptions, range, only_end); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); +} + +/* Call ->invalidate_range() on every hlist notifier that provides it */ +void __mmu_notifier_invalidate_range(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct mmu_notifier *subscription; + int id; + + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist, + srcu_read_lock_held(&srcu)) { + if (subscription->ops->invalidate_range) + subscription->ops->invalidate_range(subscription, mm, + start, end); + } + srcu_read_unlock(&srcu, id); +} + +/* + * Same as mmu_notifier_register but here the caller must hold the mmap_lock in + * write mode. A NULL mn signals the notifier is being registered for itree + * mode.
+ */ +int __mmu_notifier_register(struct mmu_notifier *subscription, + struct mm_struct *mm) +{ + struct mmu_notifier_subscriptions *subscriptions = NULL; + int ret; + + mmap_assert_write_locked(mm); + BUG_ON(atomic_read(&mm->mm_users) <= 0); + + if (!mm->notifier_subscriptions) { + /* + * kmalloc cannot be called under mm_take_all_locks(), but we + * know that mm->notifier_subscriptions can't change while we + * hold the write side of the mmap_lock. + */ + subscriptions = kzalloc( + sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL); + if (!subscriptions) + return -ENOMEM; + + INIT_HLIST_HEAD(&subscriptions->list); + spin_lock_init(&subscriptions->lock); + /* even value: no invalidation in progress, see mn_itree_is_invalidating() */ + subscriptions->invalidate_seq = 2; + subscriptions->itree = RB_ROOT_CACHED; + init_waitqueue_head(&subscriptions->wq); + INIT_HLIST_HEAD(&subscriptions->deferred_list); + } + + ret = mm_take_all_locks(mm); + if (unlikely(ret)) + goto out_clean; + + /* + * Serialize the update against mmu_notifier_unregister. A + * side note: mmu_notifier_release can't run concurrently with + * us because we hold the mm_users pin (either implicitly as + * current->mm or explicitly with get_task_mm() or similar). + * We can't race against any other mmu notifier method either + * thanks to mm_take_all_locks(). + * + * release semantics on the initialization of the + * mmu_notifier_subscriptions's contents are provided for unlocked + * readers. acquire can only be used while holding the mmgrab or + * mmget, and is safe because once created the + * mmu_notifier_subscriptions is not freed until the mm is destroyed. + * As above, users holding the mmap_lock or one of the + * mm_take_all_locks() do not need to use acquire semantics.
+ */ + if (subscriptions) + smp_store_release(&mm->notifier_subscriptions, subscriptions); + + if (subscription) { + /* Pairs with the mmdrop in mmu_notifier_unregister_* */ + mmgrab(mm); + subscription->mm = mm; + subscription->users = 1; + + spin_lock(&mm->notifier_subscriptions->lock); + hlist_add_head_rcu(&subscription->hlist, + &mm->notifier_subscriptions->list); + spin_unlock(&mm->notifier_subscriptions->lock); + } else + /* NULL subscription: only switch this mm to itree mode */ + mm->notifier_subscriptions->has_itree = true; + + mm_drop_all_locks(mm); + BUG_ON(atomic_read(&mm->mm_users) <= 0); + return 0; + +out_clean: + /* subscriptions is NULL here when nothing was allocated above */ + kfree(subscriptions); + return ret; +} +EXPORT_SYMBOL_GPL(__mmu_notifier_register); + +/** + * mmu_notifier_register - Register a notifier on a mm + * @subscription: The notifier to attach + * @mm: The mm to attach the notifier to + * + * Must not hold mmap_lock nor any other VM related lock when calling + * this registration function. Must also ensure mm_users can't go down + * to zero while this runs to avoid races with mmu_notifier_release, + * so mm has to be current->mm or the mm should be pinned safely such + * as with get_task_mm(). If the mm is not current->mm, the mm_users + * pin should be released by calling mmput after mmu_notifier_register + * returns. + * + * mmu_notifier_unregister() or mmu_notifier_put() must be always called to + * unregister the notifier. + * + * While the caller has a mmu_notifier get the subscription->mm pointer will remain + * valid, and can be converted to an active mm pointer via mmget_not_zero().
+ */ +int mmu_notifier_register(struct mmu_notifier *subscription, + struct mm_struct *mm) +{ + int ret; + + mmap_write_lock(mm); + ret = __mmu_notifier_register(subscription, mm); + mmap_write_unlock(mm); + return ret; +} +EXPORT_SYMBOL_GPL(mmu_notifier_register); + +/* + * Look up the notifier on @mm whose ops pointer equals @ops and take a + * reference on it by incrementing its users count. Returns NULL when no such + * notifier is on the list, or ERR_PTR(-EOVERFLOW) when the users count is + * already UINT_MAX. + */ +static struct mmu_notifier * +find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops) +{ + struct mmu_notifier *subscription; + + spin_lock(&mm->notifier_subscriptions->lock); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist, + lockdep_is_held(&mm->notifier_subscriptions->lock)) { + if (subscription->ops != ops) + continue; + + if (likely(subscription->users != UINT_MAX)) + subscription->users++; + else + subscription = ERR_PTR(-EOVERFLOW); + spin_unlock(&mm->notifier_subscriptions->lock); + return subscription; + } + spin_unlock(&mm->notifier_subscriptions->lock); + return NULL; +} + +/** + * mmu_notifier_get_locked - Return the single struct mmu_notifier for + * the mm & ops + * @ops: The operations struct being subscribed with + * @mm: The mm to attach notifiers to + * + * This function either allocates a new mmu_notifier via + * ops->alloc_notifier(), or returns an already existing notifier on the + * list. The value of the ops pointer is used to determine when two notifiers + * are the same. + * + * Each call to mmu_notifier_get() must be paired with a call to + * mmu_notifier_put(). The caller must hold the write side of mm->mmap_lock. + * + * While the caller has a mmu_notifier get the mm pointer will remain valid, + * and can be converted to an active mm pointer via mmget_not_zero(). + * + * Return: the existing or newly registered notifier, or an ERR_PTR() value + * when the lookup, the allocation or the registration fails.
+ */ +struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, + struct mm_struct *mm) +{ + struct mmu_notifier *subscription; + int ret; + + mmap_assert_write_locked(mm); + + if (mm->notifier_subscriptions) { + subscription = find_get_mmu_notifier(mm, ops); + if (subscription) + return subscription; + } + + subscription = ops->alloc_notifier(mm); + if (IS_ERR(subscription)) + return subscription; + subscription->ops = ops; + ret = __mmu_notifier_register(subscription, mm); + if (ret) + goto out_free; + return subscription; +out_free: + subscription->ops->free_notifier(subscription); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(mmu_notifier_get_locked); + +/* this is called after the last mmu_notifier_unregister() returned */ +void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm) +{ + BUG_ON(!hlist_empty(&mm->notifier_subscriptions->list)); + kfree(mm->notifier_subscriptions); + mm->notifier_subscriptions = LIST_POISON1; /* debug: poison the now stale pointer */ +} + +/* + * This releases the mm_count pin automatically and frees the mm + * structure if it was the last user of it. It serializes against + * running mmu notifiers with SRCU and against mmu_notifier_unregister + * with the unregister lock + SRCU. All sptes must be dropped before + * calling mmu_notifier_unregister. ->release or any other notifier + * method may be invoked concurrently with mmu_notifier_unregister, + * and only after mmu_notifier_unregister returned we're guaranteed + * that ->release or any other method can't run anymore. + */ +void mmu_notifier_unregister(struct mmu_notifier *subscription, + struct mm_struct *mm) +{ + BUG_ON(atomic_read(&mm->mm_count) <= 0); + + if (!hlist_unhashed(&subscription->hlist)) { + /* + * SRCU here will force exit_mmap to wait for ->release to + * finish before freeing the pages. + */ + int id; + + id = srcu_read_lock(&srcu); + /* + * exit_mmap will block in mmu_notifier_release to guarantee + * that ->release is called before freeing the pages.
+ */ + if (subscription->ops->release) + subscription->ops->release(subscription, mm); + srcu_read_unlock(&srcu, id); + + spin_lock(&mm->notifier_subscriptions->lock); + /* + * The _init variant is used instead of a plain RCU list delete + * since __mmu_notifier_release can delete it before we hold + * the lock. + */ + hlist_del_init_rcu(&subscription->hlist); + spin_unlock(&mm->notifier_subscriptions->lock); + } + + /* + * Wait for any running method to finish, of course including + * ->release if it was run by mmu_notifier_release instead of us. + */ + synchronize_srcu(&srcu); + + BUG_ON(atomic_read(&mm->mm_count) <= 0); + + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister); + +/* + * SRCU callback queued by mmu_notifier_put(): free the notifier through + * ops->free_notifier() and drop the mm reference. The mm pointer is read + * before free_notifier() runs, presumably because the subscription must not + * be touched afterwards. + */ +static void mmu_notifier_free_rcu(struct rcu_head *rcu) +{ + struct mmu_notifier *subscription = + container_of(rcu, struct mmu_notifier, rcu); + struct mm_struct *mm = subscription->mm; + + subscription->ops->free_notifier(subscription); + /* Pairs with the get in __mmu_notifier_register() */ + mmdrop(mm); +} + +/** + * mmu_notifier_put - Release the reference on the notifier + * @subscription: The notifier to act on + * + * This function must be paired with each mmu_notifier_get(), it releases the + * reference obtained by the get. If this is the last reference then process + * to free the notifier will be run asynchronously. + * + * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release + * when the mm_struct is destroyed. Instead free_notifier is always called to + * release any resources held by the user. + * + * As ops->release is not guaranteed to be called, the user must ensure that + * all sptes are dropped, and no new sptes can be established before + * mmu_notifier_put() is called. + * + * This function can be called from the ops->release callback, however the + * caller must still ensure it is called pairwise with mmu_notifier_get(). + * + * Modules calling this function must call mmu_notifier_synchronize() in + * their __exit functions to ensure the async work is completed.
+ */ +void mmu_notifier_put(struct mmu_notifier *subscription) +{ + struct mm_struct *mm = subscription->mm; + + spin_lock(&mm->notifier_subscriptions->lock); + /* Only the put that brings users to zero unhashes and queues the free */ + if (WARN_ON(!subscription->users) || --subscription->users) + goto out_unlock; + hlist_del_init_rcu(&subscription->hlist); + spin_unlock(&mm->notifier_subscriptions->lock); + + call_srcu(&srcu, &subscription->rcu, mmu_notifier_free_rcu); + return; + +out_unlock: + spin_unlock(&mm->notifier_subscriptions->lock); +} +EXPORT_SYMBOL_GPL(mmu_notifier_put); + +/* + * Initialise @interval_sub for the range starting at @start with @length + * bytes and add it to the interval tree, or queue the add on the deferred + * list while the tree is being invalidated. + * + * Returns 0 on success, -EOVERFLOW when @length is 0 or start + length - 1 + * overflows, and -EINVAL (with a warning) when mm_users is not positive. + */ +static int __mmu_interval_notifier_insert( + struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, + struct mmu_notifier_subscriptions *subscriptions, unsigned long start, + unsigned long length, const struct mmu_interval_notifier_ops *ops) +{ + interval_sub->mm = mm; + interval_sub->ops = ops; + RB_CLEAR_NODE(&interval_sub->interval_tree.rb); + interval_sub->interval_tree.start = start; + /* + * Note that the representation of the intervals in the interval tree + * considers the ending point as contained in the interval. + */ + if (length == 0 || + check_add_overflow(start, length - 1, + &interval_sub->interval_tree.last)) + return -EOVERFLOW; + + /* Must call with a mmget() held */ + if (WARN_ON(atomic_read(&mm->mm_users) <= 0)) + return -EINVAL; + + /* pairs with mmdrop in mmu_interval_notifier_remove() */ + mmgrab(mm); + + /* + * If some invalidate_range_start/end region is going on in parallel + * we don't know what VA ranges are affected, so we must assume this + * new range is included. + * + * If the itree is invalidating then we are not allowed to change + * it. Retrying until invalidation is done is tricky due to the + * possibility for live lock, instead defer the add to + * mn_itree_inv_end() so this algorithm is deterministic.
+ * + * In all cases the value for the interval_sub->invalidate_seq should be + * odd, see mmu_interval_read_begin() + */ + spin_lock(&subscriptions->lock); + if (subscriptions->active_invalidate_ranges) { + if (mn_itree_is_invalidating(subscriptions)) + hlist_add_head(&interval_sub->deferred_item, + &subscriptions->deferred_list); + else { + subscriptions->invalidate_seq |= 1; + interval_tree_insert(&interval_sub->interval_tree, + &subscriptions->itree); + } + interval_sub->invalidate_seq = subscriptions->invalidate_seq; + } else { + WARN_ON(mn_itree_is_invalidating(subscriptions)); + /* + * The starting seq for a subscription not under invalidation + * should be odd, not equal to the current invalidate_seq and + * invalidate_seq should not 'wrap' to the new seq any time + * soon. + */ + interval_sub->invalidate_seq = + subscriptions->invalidate_seq - 1; + interval_tree_insert(&interval_sub->interval_tree, + &subscriptions->itree); + } + spin_unlock(&subscriptions->lock); + return 0; +} + +/** + * mmu_interval_notifier_insert - Insert an interval notifier + * @interval_sub: Interval subscription to register + * @start: Starting virtual address to monitor + * @length: Length of the range to monitor + * @mm: mm_struct to attach to + * @ops: Interval notifier operations to be called on matching events + * + * This function subscribes the interval notifier for notifications from the + * mm. Upon return the ops related to mmu_interval_notifier will be called + * whenever an event that intersects with the given range occurs. + * + * Upon return the range_notifier may not be present in the interval tree yet. + * The caller must use the normal interval notifier read flow via + * mmu_interval_read_begin() to establish SPTEs for this range.
+ */ +int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, + struct mm_struct *mm, unsigned long start, + unsigned long length, + const struct mmu_interval_notifier_ops *ops) +{ + struct mmu_notifier_subscriptions *subscriptions; + int ret; + + might_lock(&mm->mmap_lock); + + subscriptions = smp_load_acquire(&mm->notifier_subscriptions); + if (!subscriptions || !subscriptions->has_itree) { + ret = mmu_notifier_register(NULL, mm); + if (ret) + return ret; + subscriptions = mm->notifier_subscriptions; + } + return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions, + start, length, ops); +} +EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert); + +int mmu_interval_notifier_insert_locked( + struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, + unsigned long start, unsigned long length, + const struct mmu_interval_notifier_ops *ops) +{ + struct mmu_notifier_subscriptions *subscriptions = + mm->notifier_subscriptions; + int ret; + + mmap_assert_write_locked(mm); + + if (!subscriptions || !subscriptions->has_itree) { + ret = __mmu_notifier_register(NULL, mm); + if (ret) + return ret; + subscriptions = mm->notifier_subscriptions; + } + return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions, + start, length, ops); +} +EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); + +static bool +mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions, + unsigned long seq) +{ + bool ret; + + spin_lock(&subscriptions->lock); + ret = subscriptions->invalidate_seq != seq; + spin_unlock(&subscriptions->lock); + return ret; +} + +/** + * mmu_interval_notifier_remove - Remove a interval notifier + * @interval_sub: Interval subscription to unregister + * + * This function must be paired with mmu_interval_notifier_insert(). It cannot + * be called from any ops callback. + * + * Once this returns ops callbacks are no longer running on other CPUs and + * will not be called in future. 
+ */ +void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub) +{ + struct mm_struct *mm = interval_sub->mm; + struct mmu_notifier_subscriptions *subscriptions = + mm->notifier_subscriptions; + unsigned long seq = 0; + + might_sleep(); + + spin_lock(&subscriptions->lock); + if (mn_itree_is_invalidating(subscriptions)) { + /* + * remove is being called after insert put this on the + * deferred list, but before the deferred list was processed. + */ + if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb)) { + hlist_del(&interval_sub->deferred_item); + } else { + hlist_add_head(&interval_sub->deferred_item, + &subscriptions->deferred_list); + seq = subscriptions->invalidate_seq; + } + } else { + WARN_ON(RB_EMPTY_NODE(&interval_sub->interval_tree.rb)); + interval_tree_remove(&interval_sub->interval_tree, + &subscriptions->itree); + } + spin_unlock(&subscriptions->lock); + + /* + * The possible sleep on progress in the invalidation requires the + * caller not hold any locks held by invalidation callbacks. + */ + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); + if (seq) + wait_event(subscriptions->wq, + mmu_interval_seq_released(subscriptions, seq)); + + /* pairs with mmgrab in mmu_interval_notifier_insert() */ + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_interval_notifier_remove); + +/** + * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed + * + * This function ensures that all outstanding async SRU work from + * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops + * associated with an unused mmu_notifier will no longer be called. + * + * Before using the caller must ensure that all of its mmu_notifiers have been + * fully released via mmu_notifier_put(). + * + * Modules using the mmu_notifier_put() API should call this in their __exit + * function to avoid module unloading races. 
+ */ +void mmu_notifier_synchronize(void) +{ + synchronize_srcu(&srcu); +} +EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); + +bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) +{ + if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) + return false; + /* Return true if the vma still have the read flag set. */ + return range->vma->vm_flags & VM_READ; +} +EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); diff --git a/mm/mmzone.c b/mm/mmzone.c new file mode 100644 index 000000000..68e1511be --- /dev/null +++ b/mm/mmzone.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/mm/mmzone.c + * + * management codes for pgdats, zones and page flags + */ + + +#include +#include +#include + +struct pglist_data *first_online_pgdat(void) +{ + return NODE_DATA(first_online_node); +} + +struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) +{ + int nid = next_online_node(pgdat->node_id); + + if (nid == MAX_NUMNODES) + return NULL; + return NODE_DATA(nid); +} + +/* + * next_zone - helper magic for for_each_zone() + */ +struct zone *next_zone(struct zone *zone) +{ + pg_data_t *pgdat = zone->zone_pgdat; + + if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) + zone++; + else { + pgdat = next_online_pgdat(pgdat); + if (pgdat) + zone = pgdat->node_zones; + else + zone = NULL; + } + return zone; +} + +static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) +{ +#ifdef CONFIG_NUMA + return node_isset(zonelist_node_idx(zref), *nodes); +#else + return 1; +#endif /* CONFIG_NUMA */ +} + +/* Returns the next zone at or below highest_zoneidx in a zonelist */ +struct zoneref *__next_zones_zonelist(struct zoneref *z, + enum zone_type highest_zoneidx, + nodemask_t *nodes) +{ + /* + * Find the next suitable zone to use for the allocation. 
+ * Only filter based on nodemask if it's set + */ + if (unlikely(nodes == NULL)) + while (zonelist_zone_idx(z) > highest_zoneidx) + z++; + else + while (zonelist_zone_idx(z) > highest_zoneidx || + (z->zone && !zref_in_nodemask(z, nodes))) + z++; + + return z; +} + +void lruvec_init(struct lruvec *lruvec) +{ + enum lru_list lru; + + memset(lruvec, 0, sizeof(struct lruvec)); + spin_lock_init(&lruvec->lru_lock); + + for_each_lru(lru) + INIT_LIST_HEAD(&lruvec->lists[lru]); + /* + * The "Unevictable LRU" is imaginary: though its size is maintained, + * it is never scanned, and unevictable pages are not threaded on it + * (so that their lru fields can be reused to hold mlock_count). + * Poison its list head, so that any operations on it would crash. + */ + list_del(&lruvec->lists[LRU_UNEVICTABLE]); + + lru_gen_init_lruvec(lruvec); +} + +#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) +int page_cpupid_xchg_last(struct page *page, int cpupid) +{ + unsigned long old_flags, flags; + int last_cpupid; + + old_flags = READ_ONCE(page->flags); + do { + flags = old_flags; + last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; + + flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); + flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + + return last_cpupid; +} +#endif diff --git a/mm/mprotect.c b/mm/mprotect.c new file mode 100644 index 000000000..668bfaa6e --- /dev/null +++ b/mm/mprotect.c @@ -0,0 +1,875 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/mprotect.c + * + * (C) Copyright 1994 Linus Torvalds + * (C) Copyright 2002 Christoph Hellwig + * + * Address space accounting code + * (C) Copyright 2002 Red Hat Inc, All Rights Reserved + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static inline bool can_change_pte_writable(struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + struct page *page; + + VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte)); + + if (pte_protnone(pte) || !pte_dirty(pte)) + return false; + + /* Do we need write faults for softdirty tracking? */ + if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) + return false; + + /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_pte_wp(vma, pte)) + return false; + + if (!(vma->vm_flags & VM_SHARED)) { + /* + * We can only special-case on exclusive anonymous pages, + * because we know that our write-fault handler similarly would + * map them writable without any additional checks while holding + * the PT lock. + */ + page = vm_normal_page(vma, addr, pte); + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + } + + return true; +} + +static unsigned long change_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) +{ + pte_t *pte, oldpte; + spinlock_t *ptl; + unsigned long pages = 0; + int target_node = NUMA_NO_NODE; + bool prot_numa = cp_flags & MM_CP_PROT_NUMA; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + + tlb_change_page_size(tlb, PAGE_SIZE); + + /* + * Can be called with only the mmap_lock for reading by + * prot_numa so we must check the pmd isn't constantly + * changing from under us from pmd_none to pmd_trans_huge + * and/or the other way around. + */ + if (pmd_trans_unstable(pmd)) + return 0; + + /* + * The pmd points to a regular pte so the pmd can't change + * from under us even if the mmap_lock is only hold for + * reading. 
+ */ + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + + /* Get target node for single threaded private VMAs */ + if (prot_numa && !(vma->vm_flags & VM_SHARED) && + atomic_read(&vma->vm_mm->mm_users) == 1) + target_node = numa_node_id(); + + flush_tlb_batched_pending(vma->vm_mm); + arch_enter_lazy_mmu_mode(); + do { + oldpte = *pte; + if (pte_present(oldpte)) { + pte_t ptent; + bool preserve_write = prot_numa && pte_write(oldpte); + + /* + * Avoid trapping faults against the zero or KSM + * pages. See similar comment in change_huge_pmd. + */ + if (prot_numa) { + struct page *page; + int nid; + bool toptier; + + /* Avoid TLB flush if possible */ + if (pte_protnone(oldpte)) + continue; + + page = vm_normal_page(vma, addr, oldpte); + if (!page || is_zone_device_page(page) || PageKsm(page)) + continue; + + /* Also skip shared copy-on-write pages */ + if (is_cow_mapping(vma->vm_flags) && + page_count(page) != 1) + continue; + + /* + * While migration can move some dirty pages, + * it cannot move them all from MIGRATE_ASYNC + * context. + */ + if (page_is_file_lru(page) && PageDirty(page)) + continue; + + /* + * Don't mess with PTEs if page is already on the node + * a single-threaded process is running on. 
+ */ + nid = page_to_nid(page); + if (target_node == nid) + continue; + toptier = node_is_toptier(nid); + + /* + * Skip scanning top tier node if normal numa + * balancing is disabled + */ + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) && + toptier) + continue; + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !toptier) + xchg_page_access_time(page, + jiffies_to_msecs(jiffies)); + } + + oldpte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_modify(oldpte, newprot); + if (preserve_write) + ptent = pte_mk_savedwrite(ptent); + + if (uffd_wp) { + ptent = pte_wrprotect(ptent); + ptent = pte_mkuffd_wp(ptent); + } else if (uffd_wp_resolve) { + ptent = pte_clear_uffd_wp(ptent); + } + + /* + * In some writable, shared mappings, we might want + * to catch actual write access -- see + * vma_wants_writenotify(). + * + * In all writable, private mappings, we have to + * properly handle COW. + * + * In both cases, we can sometimes still change PTEs + * writable and avoid the write-fault handler, for + * example, if a PTE is already dirty and no other + * COW or special handling is required. 
+ */ + if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && + !pte_write(ptent) && + can_change_pte_writable(vma, addr, ptent)) + ptent = pte_mkwrite(ptent); + + ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); + if (pte_needs_flush(oldpte, ptent)) + tlb_flush_pte_range(tlb, addr, PAGE_SIZE); + pages++; + } else if (is_swap_pte(oldpte)) { + swp_entry_t entry = pte_to_swp_entry(oldpte); + pte_t newpte; + + if (is_writable_migration_entry(entry)) { + struct page *page = pfn_swap_entry_to_page(entry); + + /* + * A protection check is difficult so + * just be safe and disable write + */ + if (PageAnon(page)) + entry = make_readable_exclusive_migration_entry( + swp_offset(entry)); + else + entry = make_readable_migration_entry(swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else if (is_writable_device_private_entry(entry)) { + /* + * We do not preserve soft-dirtiness. See + * copy_one_pte() for explanation. + */ + entry = make_readable_device_private_entry( + swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else if (is_writable_device_exclusive_entry(entry)) { + entry = make_readable_device_exclusive_entry( + swp_offset(entry)); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else if (pte_marker_entry_uffd_wp(entry)) { + /* + * If this is uffd-wp pte marker and we'd like + * to unprotect it, drop it; the next page + * fault will trigger without uffd trapping. 
+ */ + if (uffd_wp_resolve) { + pte_clear(vma->vm_mm, addr, pte); + pages++; + } + continue; + } else { + newpte = oldpte; + } + + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); + + if (!pte_same(oldpte, newpte)) { + set_pte_at(vma->vm_mm, addr, pte, newpte); + pages++; + } + } else { + /* It must be an none page, or what else?.. */ + WARN_ON_ONCE(!pte_none(oldpte)); +#ifdef CONFIG_PTE_MARKER_UFFD_WP + if (unlikely(uffd_wp && !vma_is_anonymous(vma))) { + /* + * For file-backed mem, we need to be able to + * wr-protect a none pte, because even if the + * pte is none, the page/swap cache could + * exist. Doing that by install a marker. + */ + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); + pages++; + } +#endif + } + } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); + + return pages; +} + +/* + * Used when setting automatic NUMA hinting protection where it is + * critical that a numa hinting PMD is not confused with a bad PMD. + */ +static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) +{ + pmd_t pmdval = pmd_read_atomic(pmd); + + /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + barrier(); +#endif + + if (pmd_none(pmdval)) + return 1; + if (pmd_trans_huge(pmdval)) + return 0; + if (unlikely(pmd_bad(pmdval))) { + pmd_clear_bad(pmd); + return 1; + } + + return 0; +} + +/* Return true if we're uffd wr-protecting file-backed memory, or false */ +static inline bool +uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags) +{ + return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma); +} + +/* + * If wr-protecting the range for file-backed, populate pgtable for the case + * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc() + * failed it means no memory, we don't have a better option but stop. 
+ */ +#define change_pmd_prepare(vma, pmd, cp_flags) \ + do { \ + if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ + if (WARN_ON_ONCE(pte_alloc(vma->vm_mm, pmd))) \ + break; \ + } \ + } while (0) +/* + * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to + * have separate change_pmd_prepare() because pte_alloc() returns 0 on success, + * while {pmd|pud|p4d}_alloc() returns the valid pointer on success. + */ +#define change_prepare(vma, high, low, addr, cp_flags) \ + do { \ + if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \ + low##_t *p = low##_alloc(vma->vm_mm, high, addr); \ + if (WARN_ON_ONCE(p == NULL)) \ + break; \ + } \ + } while (0) + +static inline unsigned long change_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) +{ + pmd_t *pmd; + unsigned long next; + unsigned long pages = 0; + unsigned long nr_huge_updates = 0; + struct mmu_notifier_range range; + + range.start = 0; + + pmd = pmd_offset(pud, addr); + do { + unsigned long this_pages; + + next = pmd_addr_end(addr, end); + + change_pmd_prepare(vma, pmd, cp_flags); + /* + * Automatic NUMA balancing walks the tables with mmap_lock + * held for read. It's possible a parallel update to occur + * between pmd_trans_huge() and a pmd_none_or_clear_bad() + * check leading to a false positive and clearing. + * Hence, it's necessary to atomically read the PMD value + * for all the checks. 
+ */ + if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) && + pmd_none_or_clear_bad_unless_trans_huge(pmd)) + goto next; + + /* invoke the mmu notifier if the pmd is populated */ + if (!range.start) { + mmu_notifier_range_init(&range, + MMU_NOTIFY_PROTECTION_VMA, 0, + vma, vma->vm_mm, addr, end); + mmu_notifier_invalidate_range_start(&range); + } + + if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if ((next - addr != HPAGE_PMD_SIZE) || + uffd_wp_protect_file(vma, cp_flags)) { + __split_huge_pmd(vma, pmd, addr, false, NULL); + /* + * For file-backed, the pmd could have been + * cleared; make sure pmd populated if + * necessary, then fall-through to pte level. + */ + change_pmd_prepare(vma, pmd, cp_flags); + } else { + /* + * change_huge_pmd() does not defer TLB flushes, + * so no need to propagate the tlb argument. + */ + int nr_ptes = change_huge_pmd(tlb, vma, pmd, + addr, newprot, cp_flags); + + if (nr_ptes) { + if (nr_ptes == HPAGE_PMD_NR) { + pages += HPAGE_PMD_NR; + nr_huge_updates++; + } + + /* huge pmd was handled */ + goto next; + } + } + /* fall through, the trans huge pmd just split */ + } + this_pages = change_pte_range(tlb, vma, pmd, addr, next, + newprot, cp_flags); + pages += this_pages; +next: + cond_resched(); + } while (pmd++, addr = next, addr != end); + + if (range.start) + mmu_notifier_invalidate_range_end(&range); + + if (nr_huge_updates) + count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); + return pages; +} + +static inline unsigned long change_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) +{ + pud_t *pud; + unsigned long next; + unsigned long pages = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + change_prepare(vma, pud, pmd, addr, cp_flags); + if (pud_none_or_clear_bad(pud)) + continue; + pages += change_pmd_range(tlb, vma, pud, addr, next, newprot, + cp_flags); + } while 
(pud++, addr = next, addr != end); + + return pages; +} + +static inline unsigned long change_p4d_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) +{ + p4d_t *p4d; + unsigned long next; + unsigned long pages = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + change_prepare(vma, p4d, pud, addr, cp_flags); + if (p4d_none_or_clear_bad(p4d)) + continue; + pages += change_pud_range(tlb, vma, p4d, addr, next, newprot, + cp_flags); + } while (p4d++, addr = next, addr != end); + + return pages; +} + +static unsigned long change_protection_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, + unsigned long end, pgprot_t newprot, unsigned long cp_flags) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + unsigned long next; + unsigned long pages = 0; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + tlb_start_vma(tlb, vma); + do { + next = pgd_addr_end(addr, end); + change_prepare(vma, pgd, p4d, addr, cp_flags); + if (pgd_none_or_clear_bad(pgd)) + continue; + pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot, + cp_flags); + } while (pgd++, addr = next, addr != end); + + tlb_end_vma(tlb, vma); + + return pages; +} + +unsigned long change_protection(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgprot_t newprot, + unsigned long cp_flags) +{ + unsigned long pages; + + BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); + + if (is_vm_hugetlb_page(vma)) + pages = hugetlb_change_protection(vma, start, end, newprot, + cp_flags); + else + pages = change_protection_range(tlb, vma, start, end, newprot, + cp_flags); + + return pages; +} + +static int prot_none_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? 
+ 0 : -EACCES; +} + +static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ? + 0 : -EACCES; +} + +static int prot_none_test(unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + return 0; +} + +static const struct mm_walk_ops prot_none_walk_ops = { + .pte_entry = prot_none_pte_entry, + .hugetlb_entry = prot_none_hugetlb_entry, + .test_walk = prot_none_test, +}; + +int +mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, + struct vm_area_struct **pprev, unsigned long start, + unsigned long end, unsigned long newflags) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long oldflags = vma->vm_flags; + long nrpages = (end - start) >> PAGE_SHIFT; + unsigned long charged = 0; + bool try_change_writable; + pgoff_t pgoff; + int error; + + if (newflags == oldflags) { + *pprev = vma; + return 0; + } + + /* + * Do PROT_NONE PFN permission checks here when we can still + * bail out without undoing a lot of state. This is a rather + * uncommon case, so doesn't need to be very optimized. + */ + if (arch_has_pfn_modify_check() && + (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && + (newflags & VM_ACCESS_FLAGS) == 0) { + pgprot_t new_pgprot = vm_get_page_prot(newflags); + + error = walk_page_range(current->mm, start, end, + &prot_none_walk_ops, &new_pgprot); + if (error) + return error; + } + + /* + * If we make a private mapping writable we increase our commit; + * but (without finer accounting) cannot reduce our commit if we + * make it unwritable again. hugetlb mapping were accounted for + * even if read-only so there is no need to account for them here + */ + if (newflags & VM_WRITE) { + /* Check space limits when area turns into data. 
*/ + if (!may_expand_vm(mm, newflags, nrpages) && + may_expand_vm(mm, oldflags, nrpages)) + return -ENOMEM; + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| + VM_SHARED|VM_NORESERVE))) { + charged = nrpages; + if (security_vm_enough_memory_mm(mm, charged)) + return -ENOMEM; + newflags |= VM_ACCOUNT; + } + } + + /* + * First try to merge with previous and/or next vma. + */ + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *pprev = vma_merge(mm, *pprev, start, end, newflags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (*pprev) { + vma = *pprev; + VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); + goto success; + } + + *pprev = vma; + + if (start != vma->vm_start) { + error = split_vma(mm, vma, start, 1); + if (error) + goto fail; + } + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto fail; + } + +success: + /* + * vm_flags and vm_page_prot are protected by the mmap_lock + * held in write mode. + */ + vma->vm_flags = newflags; + /* + * We want to check manually if we can change individual PTEs writable + * if we can't do that automatically for all PTEs in a mapping. For + * private mappings, that's always the case when we have write + * permissions as we properly have to handle COW. + */ + if (vma->vm_flags & VM_SHARED) + try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot); + else + try_change_writable = !!(vma->vm_flags & VM_WRITE); + vma_set_page_prot(vma); + + change_protection(tlb, vma, start, end, vma->vm_page_prot, + try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0); + + /* + * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major + * fault on access. 
+ */ + if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED && + (newflags & VM_WRITE)) { + populate_vma_page_range(vma, start, end, NULL); + } + + vm_stat_account(mm, oldflags, -nrpages); + vm_stat_account(mm, newflags, nrpages); + perf_event_mmap(vma); + return 0; + +fail: + vm_unacct_memory(charged); + return error; +} + +/* + * pkey==-1 when doing a legacy mprotect() + */ +static int do_mprotect_pkey(unsigned long start, size_t len, + unsigned long prot, int pkey) +{ + unsigned long nstart, end, tmp, reqprot; + struct vm_area_struct *vma, *prev; + int error; + const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); + const bool rier = (current->personality & READ_IMPLIES_EXEC) && + (prot & PROT_READ); + struct mmu_gather tlb; + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + + start = untagged_addr(start); + + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); + if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ + return -EINVAL; + + if (start & ~PAGE_MASK) + return -EINVAL; + if (!len) + return 0; + len = PAGE_ALIGN(len); + end = start + len; + if (end <= start) + return -ENOMEM; + if (!arch_validate_prot(prot, start)) + return -EINVAL; + + reqprot = prot; + + if (mmap_write_lock_killable(current->mm)) + return -EINTR; + + /* + * If userspace did not allocate the pkey, do not let + * them use it here. 
+ */ + error = -EINVAL; + if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) + goto out; + + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); + error = -ENOMEM; + if (!vma) + goto out; + + if (unlikely(grows & PROT_GROWSDOWN)) { + if (vma->vm_start >= end) + goto out; + start = vma->vm_start; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out; + } else { + if (vma->vm_start > start) + goto out; + if (unlikely(grows & PROT_GROWSUP)) { + end = vma->vm_end; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSUP)) + goto out; + } + } + + if (start > vma->vm_start) + prev = vma; + else + prev = mas_prev(&mas, 0); + + tlb_gather_mmu(&tlb, current->mm); + for (nstart = start ; ; ) { + unsigned long mask_off_old_flags; + unsigned long newflags; + int new_vma_pkey; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + + /* Does the application expect PROT_READ to imply PROT_EXEC */ + if (rier && (vma->vm_flags & VM_MAYEXEC)) + prot |= PROT_EXEC; + + /* + * Each mprotect() call explicitly passes r/w/x permissions. + * If a permission is not passed to mprotect(), it must be + * cleared from the VMA. 
+ */ + mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC | + VM_FLAGS_CLEAR; + + new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey); + newflags = calc_vm_prot_bits(prot, new_vma_pkey); + newflags |= (vma->vm_flags & ~mask_off_old_flags); + + /* newflags >> 4 shift VM_MAY% in place of VM_% */ + if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) { + error = -EACCES; + break; + } + + /* Allow architectures to sanity-check the new flags */ + if (!arch_validate_flags(newflags)) { + error = -EINVAL; + break; + } + + error = security_file_mprotect(vma, reqprot, prot); + if (error) + break; + + tmp = vma->vm_end; + if (tmp > end) + tmp = end; + + if (vma->vm_ops && vma->vm_ops->mprotect) { + error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags); + if (error) + break; + } + + error = mprotect_fixup(&tlb, vma, &prev, nstart, tmp, newflags); + if (error) + break; + + nstart = tmp; + + if (nstart < prev->vm_end) + nstart = prev->vm_end; + if (nstart >= end) + break; + + vma = find_vma(current->mm, prev->vm_end); + if (!vma || vma->vm_start != nstart) { + error = -ENOMEM; + break; + } + prot = reqprot; + } + tlb_finish_mmu(&tlb); +out: + mmap_write_unlock(current->mm); + return error; +} + +SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, + unsigned long, prot) +{ + return do_mprotect_pkey(start, len, prot, -1); +} + +#ifdef CONFIG_ARCH_HAS_PKEYS + +SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len, + unsigned long, prot, int, pkey) +{ + return do_mprotect_pkey(start, len, prot, pkey); +} + +SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val) +{ + int pkey; + int ret; + + /* No flags supported yet. 
*/ + if (flags) + return -EINVAL; + /* check for unsupported init values */ + if (init_val & ~PKEY_ACCESS_MASK) + return -EINVAL; + + mmap_write_lock(current->mm); + pkey = mm_pkey_alloc(current->mm); + + ret = -ENOSPC; + if (pkey == -1) + goto out; + + ret = arch_set_user_pkey_access(current, pkey, init_val); + if (ret) { + mm_pkey_free(current->mm, pkey); + goto out; + } + ret = pkey; +out: + mmap_write_unlock(current->mm); + return ret; +} + +SYSCALL_DEFINE1(pkey_free, int, pkey) +{ + int ret; + + mmap_write_lock(current->mm); + ret = mm_pkey_free(current->mm, pkey); + mmap_write_unlock(current->mm); + + /* + * We could provide warnings or errors if any VMA still + * has the pkey set here. + */ + return ret; +} + +#endif /* CONFIG_ARCH_HAS_PKEYS */ diff --git a/mm/mremap.c b/mm/mremap.c new file mode 100644 index 000000000..930f65c31 --- /dev/null +++ b/mm/mremap.c @@ -0,0 +1,1105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/mremap.c + * + * (C) Copyright 1996 Linus Torvalds + * + * Address space accounting code + * (C) Copyright 2002 Red Hat Inc, All Rights Reserved + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" + +static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd = pgd_offset(mm, addr); + if (pgd_none_or_clear_bad(pgd)) + return NULL; + + p4d = p4d_offset(pgd, addr); + if (p4d_none_or_clear_bad(p4d)) + return NULL; + + pud = pud_offset(p4d, addr); + if (pud_none_or_clear_bad(pud)) + return NULL; + + return pud; +} + +static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = get_old_pud(mm, addr); + if (!pud) + return NULL; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return NULL; + + return pmd; +} + +static pud_t *alloc_new_pud(struct 
/*
 * move_ptes() - move individual ptes from one pte table to another.
 * @vma:             vma the entries are moved out of
 * @old_pmd:         pmd whose pte table holds the source entries
 * @old_addr:        first source address
 * @old_end:         end of the source range (exclusive)
 * @new_vma:         vma the entries are moved into
 * @new_pmd:         pmd whose pte table receives the entries
 * @new_addr:        first destination address
 * @need_rmap_locks: take the rmap locks of @vma around the move
 *
 * Walks [old_addr, old_end) in PAGE_SIZE steps.  Every non-empty source pte
 * is cleared and re-installed at the matching destination slot.  Both pte
 * table locks are held for the whole walk, and if any present pte was moved
 * the source range is TLB-flushed before either lock is released.
 */
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
		unsigned long old_addr, unsigned long old_end,
		struct vm_area_struct *new_vma, pmd_t *new_pmd,
		unsigned long new_addr, bool need_rmap_locks)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *old_pte, *new_pte, pte;
	spinlock_t *old_ptl, *new_ptl;
	bool force_flush = false;
	/* Remembered up front: old_addr is advanced by the loop below. */
	unsigned long len = old_end - old_addr;

	/*
	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
	 * locks to ensure that rmap will always observe either the old or the
	 * new ptes. This is the easiest way to avoid races with
	 * truncate_pagecache(), page migration, etc...
	 *
	 * When need_rmap_locks is false, we use other ways to avoid
	 * such races:
	 *
	 * - During exec() shift_arg_pages(), we use a specially tagged vma
	 *   which rmap call sites look for using vma_is_temporary_stack().
	 *
	 * - During mremap(), new_vma is often known to be placed after vma
	 *   in rmap traversal order. This ensures rmap will always observe
	 *   either the old pte, or the new pte, or both (the page table locks
	 *   serialize access to individual ptes, but only rmap traversal
	 *   order guarantees that we won't miss both the old and new ptes).
	 */
	if (need_rmap_locks)
		take_rmap_locks(vma);

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_lock prevents deadlock.
	 */
	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
	new_pte = pte_offset_map(new_pmd, new_addr);
	new_ptl = pte_lockptr(mm, new_pmd);
	/* Source and destination may share one lock; take it only once. */
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();

	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
				   new_pte++, new_addr += PAGE_SIZE) {
		/* Nothing mapped at this source slot: nothing to move. */
		if (pte_none(*old_pte))
			continue;

		pte = ptep_get_and_clear(mm, old_addr, old_pte);
		/*
		 * If we are remapping a valid PTE, make sure
		 * to flush TLB before we drop the PTL for the
		 * PTE.
		 *
		 * NOTE! Both old and new PTL matter: the old one
		 * for racing with page_mkclean(), the new one to
		 * make sure the physical page stays valid until
		 * the TLB entry for the old mapping has been
		 * flushed.
		 */
		if (pte_present(pte))
			force_flush = true;
		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
		pte = move_soft_dirty_pte(pte);
		set_pte_at(mm, new_addr, new_pte, pte);
	}

	arch_leave_lazy_mmu_mode();
	/* old_addr has reached old_end, so rebuild the range start from len. */
	if (force_flush)
		flush_tlb_range(vma, old_end - len, old_end);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	/*
	 * The loop left both pte cursors one entry past the last slot it
	 * visited, so step back by one before unmapping.
	 */
	pte_unmap(new_pte - 1);
	pte_unmap_unlock(old_pte - 1, old_ptl);
	if (need_rmap_locks)
		drop_rmap_locks(vma);
}
#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
/*
 * move_normal_pud() - move a whole PUD entry instead of its contents.
 * @vma:      vma being moved (used for locking and the TLB flush)
 * @old_addr: source address covered by @old_pud
 * @new_addr: destination address (not read by this implementation)
 * @old_pud:  source PUD entry
 * @new_pud:  destination PUD entry, expected to be empty
 *
 * Clears @old_pud and points @new_pud at the page table that @old_pud
 * referenced, then flushes PUD_SIZE bytes of TLB starting at @old_addr.
 *
 * Return: true if the entry was moved; false if the architecture does not
 * support page table moves or @new_pud is already populated (which also
 * triggers a one-time warning).
 */
static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
		  unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	if (!arch_supports_page_table_move())
		return false;
	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(vma->vm_mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	/* Both entries may be guarded by the same lock; take it only once. */
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	/* Re-check under the locks that the destination is still empty. */
	VM_BUG_ON(!pud_none(*new_pud));

	pud_populate(mm, new_pud, pud_pgtable(pud));
	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
/* PUD-level moves are not configured: always report "not moved". */
static inline bool move_normal_pud(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
		pud_t *new_pud)
{
	return false;
}
#endif
+ */ + old_ptl = pud_lock(vma->vm_mm, old_pud); + new_ptl = pud_lockptr(mm, new_pud); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + + /* Clear the pud */ + pud = *old_pud; + pud_clear(old_pud); + + VM_BUG_ON(!pud_none(*new_pud)); + + /* Set the new pud */ + /* mark soft_ditry when we add pud level soft dirty support */ + set_pud_at(mm, new_addr, new_pud, pud); + flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + + return true; +} +#else +static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) +{ + WARN_ON_ONCE(1); + return false; + +} +#endif + +enum pgt_entry { + NORMAL_PMD, + HPAGE_PMD, + NORMAL_PUD, + HPAGE_PUD, +}; + +/* + * Returns an extent of the corresponding size for the pgt_entry specified if + * valid. Else returns a smaller extent bounded by the end of the source and + * destination pgt_entry. + */ +static __always_inline unsigned long get_extent(enum pgt_entry entry, + unsigned long old_addr, unsigned long old_end, + unsigned long new_addr) +{ + unsigned long next, extent, mask, size; + + switch (entry) { + case HPAGE_PMD: + case NORMAL_PMD: + mask = PMD_MASK; + size = PMD_SIZE; + break; + case HPAGE_PUD: + case NORMAL_PUD: + mask = PUD_MASK; + size = PUD_SIZE; + break; + default: + BUILD_BUG(); + break; + } + + next = (old_addr + size) & mask; + /* even if next overflowed, extent below will be ok */ + extent = next - old_addr; + if (extent > old_end - old_addr) + extent = old_end - old_addr; + next = (new_addr + size) & mask; + if (extent > next - new_addr) + extent = next - new_addr; + return extent; +} + +/* + * Attempts to speedup the move by moving entry at the level corresponding to + * pgt_entry. Returns true if the move was successful, else false. 
/*
 * move_page_tables() - move the page-table entries backing a range.
 * @vma:             vma the range currently lives in
 * @old_addr:        start of the source range
 * @new_vma:         vma the range is moved into
 * @new_addr:        start of the destination range
 * @len:             number of bytes to move
 * @need_rmap_locks: passed on to the pte-level and huge-entry moves
 *
 * hugetlb vmas are handed to move_hugetlb_page_tables().  Otherwise the range
 * is walked in steps sized by get_extent(): a whole PUD is moved when
 * possible, else a whole PMD, else individual ptes via move_ptes().  Source
 * steps with no PUD/PMD present are skipped.  The walk stops early when a
 * destination page table cannot be allocated.
 *
 * Return: the number of bytes processed before the walk ended; equals @len
 * when the walk ran to completion.
 */
unsigned long move_page_tables(struct vm_area_struct *vma,
		unsigned long old_addr, struct vm_area_struct *new_vma,
		unsigned long new_addr, unsigned long len,
		bool need_rmap_locks)
{
	unsigned long extent, old_end;
	struct mmu_notifier_range range;
	pmd_t *old_pmd, *new_pmd;
	pud_t *old_pud, *new_pud;

	if (!len)
		return 0;

	old_end = old_addr + len;

	if (is_vm_hugetlb_page(vma))
		return move_hugetlb_page_tables(vma, new_vma, old_addr,
						new_addr, len);

	flush_cache_range(vma, old_addr, old_end);
	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
				old_addr, old_end);
	mmu_notifier_invalidate_range_start(&range);

	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
		cond_resched();
		/*
		 * If extent is PUD-sized try to speed up the move by moving at the
		 * PUD level if possible.
		 */
		extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);

		old_pud = get_old_pud(vma->vm_mm, old_addr);
		/* Nothing mapped at PUD level here: skip this extent. */
		if (!old_pud)
			continue;
		new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
		/* Allocation failed: stop and report the partial progress. */
		if (!new_pud)
			break;
		if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
			if (extent == HPAGE_PUD_SIZE) {
				move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
					       old_pud, new_pud, need_rmap_locks);
				/* We ignore and continue on error? */
				continue;
			}
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {

			/*
			 * Whole-table moves always take the rmap locks,
			 * whatever need_rmap_locks says.
			 */
			if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
					   old_pud, new_pud, true))
				continue;
		}

		/* PUD-level move not done: retry with a PMD-sized extent. */
		extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
		if (!old_pmd)
			continue;
		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
		if (!new_pmd)
			break;
		if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
		    pmd_devmap(*old_pmd)) {
			if (extent == HPAGE_PMD_SIZE &&
			    move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
					   old_pmd, new_pmd, need_rmap_locks))
				continue;
			/* Could not move it whole: split and fall back to ptes. */
			split_huge_pmd(vma, old_pmd, old_addr);
			if (pmd_trans_unstable(old_pmd))
				continue;
		} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
			   extent == PMD_SIZE) {
			/*
			 * If the extent is PMD-sized, try to speed the move by
			 * moving at the PMD level if possible.
			 */
			if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
					   old_pmd, new_pmd, true))
				continue;
		}

		/* Slow path: make sure a pte table exists, then move ptes. */
		if (pte_alloc(new_vma->vm_mm, new_pmd))
			break;
		move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
			  new_pmd, new_addr, need_rmap_locks);
	}

	mmu_notifier_invalidate_range_end(&range);

	/* old_addr is where the walk stopped; old_end - len was its start. */
	return len + old_addr - old_end;	/* how much done */
}
+ */ + err = ksm_madvise(vma, old_addr, old_addr + old_len, + MADV_UNMERGEABLE, &vm_flags); + if (err) + return err; + + if (vm_flags & VM_ACCOUNT) { + if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT)) + return -ENOMEM; + } + + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, + &need_rmap_locks); + if (!new_vma) { + if (vm_flags & VM_ACCOUNT) + vm_unacct_memory(to_account >> PAGE_SHIFT); + return -ENOMEM; + } + + moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, + need_rmap_locks); + if (moved_len < old_len) { + err = -ENOMEM; + } else if (vma->vm_ops && vma->vm_ops->mremap) { + err = vma->vm_ops->mremap(new_vma); + } + + if (unlikely(err)) { + /* + * On error, move entries back from new area to old, + * which will succeed since page tables still there, + * and then proceed to unmap new area instead of old. + */ + move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, + true); + vma = new_vma; + old_len = new_len; + old_addr = new_addr; + new_addr = err; + } else { + mremap_userfaultfd_prep(new_vma, uf); + } + + if (is_vm_hugetlb_page(vma)) { + clear_vma_resv_huge_pages(vma); + } + + /* Conceal VM_ACCOUNT so old reservation is not undone */ + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { + vma->vm_flags &= ~VM_ACCOUNT; + excess = vma->vm_end - vma->vm_start - old_len; + if (old_addr > vma->vm_start && + old_addr + old_len < vma->vm_end) + split = 1; + } + + /* + * If we failed to move page tables we still do total_vm increment + * since do_munmap() will decrement it by old_len == new_len. + * + * Since total_vm is about to be raised artificially high for a + * moment, we need to restore high watermark afterwards: if stats + * are taken meanwhile, total_vm and hiwater_vm appear too high. + * If this were a serious issue, we'd add a flag to do_munmap(). 
+ */ + hiwater_vm = mm->hiwater_vm; + vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT); + + /* Tell pfnmap has moved from this vma */ + if (unlikely(vma->vm_flags & VM_PFNMAP)) + untrack_pfn_moved(vma); + + if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { + /* We always clear VM_LOCKED[ONFAULT] on the old vma */ + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + + /* + * anon_vma links of the old vma is no longer needed after its page + * table has been moved. + */ + if (new_vma != vma && vma->vm_start == old_addr && + vma->vm_end == (old_addr + old_len)) + unlink_anon_vmas(vma); + + /* Because we won't unmap we don't need to touch locked_vm */ + return new_addr; + } + + if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { + /* OOM: unable to split vma, just get accounts right */ + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) + vm_acct_memory(old_len >> PAGE_SHIFT); + excess = 0; + } + + if (vm_flags & VM_LOCKED) { + mm->locked_vm += new_len >> PAGE_SHIFT; + *locked = true; + } + + mm->hiwater_vm = hiwater_vm; + + /* Restore VM_ACCOUNT if one or two pieces of vma left */ + if (excess) { + vma->vm_flags |= VM_ACCOUNT; + if (split) + find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT; + } + + return new_addr; +} + +static struct vm_area_struct *vma_to_resize(unsigned long addr, + unsigned long old_len, unsigned long new_len, unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long pgoff; + + vma = vma_lookup(mm, addr); + if (!vma) + return ERR_PTR(-EFAULT); + + /* + * !old_len is a special case where an attempt is made to 'duplicate' + * a mapping. This makes no sense for private mappings as it will + * instead create a fresh/new mapping unrelated to the original. This + * is contrary to the basic idea of mremap which creates new mappings + * based on the original. There are no known use cases for this + * behavior. As a result, fail such attempts. 
+ */ + if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { + pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid); + return ERR_PTR(-EINVAL); + } + + if ((flags & MREMAP_DONTUNMAP) && + (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) + return ERR_PTR(-EINVAL); + + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + return ERR_PTR(-EFAULT); + + if (new_len == old_len) + return vma; + + /* Need to be careful about a growing mapping */ + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; + if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) + return ERR_PTR(-EINVAL); + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) + return ERR_PTR(-EFAULT); + + if (mlock_future_check(mm, vma->vm_flags, new_len - old_len)) + return ERR_PTR(-EAGAIN); + + if (!may_expand_vm(mm, vma->vm_flags, + (new_len - old_len) >> PAGE_SHIFT)) + return ERR_PTR(-ENOMEM); + + return vma; +} + +static unsigned long mremap_to(unsigned long addr, unsigned long old_len, + unsigned long new_addr, unsigned long new_len, bool *locked, + unsigned long flags, struct vm_userfaultfd_ctx *uf, + struct list_head *uf_unmap_early, + struct list_head *uf_unmap) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + unsigned long map_flags = 0; + + if (offset_in_page(new_addr)) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + goto out; + + /* Ensure the old/new locations do not overlap */ + if (addr + old_len > new_addr && new_addr + new_len > addr) + goto out; + + /* + * move_vma() need us to stay 4 maps below the threshold, otherwise + * it will bail out at the very beginning. + * That is a problem if we have already unmaped the regions here + * (new_addr, and old_addr), because userspace will not know the + * state of the vma's after it gets -ENOMEM. 
+ * So, to avoid such scenario we can pre-compute if the whole + * operation has high chances to success map-wise. + * Worst-scenario case is when both vma's (new_addr and old_addr) get + * split in 3 before unmapping it. + * That means 2 more maps (1 for each) to the ones we already hold. + * Check whether current map count plus 2 still leads us to 4 maps below + * the threshold, otherwise return -ENOMEM here to be more safe. + */ + if ((mm->map_count + 2) >= sysctl_max_map_count - 3) + return -ENOMEM; + + if (flags & MREMAP_FIXED) { + ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); + if (ret) + goto out; + } + + if (old_len > new_len) { + ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap); + if (ret) + goto out; + old_len = new_len; + } + + vma = vma_to_resize(addr, old_len, new_len, flags); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; + } + + /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ + if (flags & MREMAP_DONTUNMAP && + !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) { + ret = -ENOMEM; + goto out; + } + + if (flags & MREMAP_FIXED) + map_flags |= MAP_FIXED; + + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + + ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + + ((addr - vma->vm_start) >> PAGE_SHIFT), + map_flags); + if (IS_ERR_VALUE(ret)) + goto out; + + /* We got a new mapping */ + if (!(flags & MREMAP_FIXED)) + new_addr = ret; + + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf, + uf_unmap); + +out: + return ret; +} + +static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) +{ + unsigned long end = vma->vm_end + delta; + + if (end < vma->vm_end) /* overflow */ + return 0; + if (find_vma_intersection(vma->vm_mm, vma->vm_end, end)) + return 0; + if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, + 0, MAP_FIXED) & ~PAGE_MASK) + return 0; + return 1; +} + +/* + * Expand (or shrink) an existing mapping, 
potentially moving it at the + * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise + * This option implies MREMAP_MAYMOVE. + */ +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + bool locked = false; + bool downgraded = false; + struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; + LIST_HEAD(uf_unmap_early); + LIST_HEAD(uf_unmap); + + /* + * There is a deliberate asymmetry here: we strip the pointer tag + * from the old address but leave the new address alone. This is + * for consistency with mmap(), where we prevent the creation of + * aliasing mappings in userspace by leaving the tag bits of the + * mapping address intact. A non-zero tag will cause the subsequent + * range checks to reject the address as invalid. + * + * See Documentation/arm64/tagged-address-abi.rst for more information. + */ + addr = untagged_addr(addr); + + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) + return ret; + + if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) + return ret; + + /* + * MREMAP_DONTUNMAP is always a move and it does not allow resizing + * in the process. + */ + if (flags & MREMAP_DONTUNMAP && + (!(flags & MREMAP_MAYMOVE) || old_len != new_len)) + return ret; + + + if (offset_in_page(addr)) + return ret; + + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + + /* + * We allow a zero old-len as a special case + * for DOS-emu "duplicate shm area" thing. But + * a zero new-len is nonsensical. 
+ */ + if (!new_len) + return ret; + + if (mmap_write_lock_killable(current->mm)) + return -EINTR; + vma = vma_lookup(mm, addr); + if (!vma) { + ret = -EFAULT; + goto out; + } + + if (is_vm_hugetlb_page(vma)) { + struct hstate *h __maybe_unused = hstate_vma(vma); + + old_len = ALIGN(old_len, huge_page_size(h)); + new_len = ALIGN(new_len, huge_page_size(h)); + + /* addrs must be huge page aligned */ + if (addr & ~huge_page_mask(h)) + goto out; + if (new_addr & ~huge_page_mask(h)) + goto out; + + /* + * Don't allow remap expansion, because the underlying hugetlb + * reservation is not yet capable to handle split reservation. + */ + if (new_len > old_len) + goto out; + } + + if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) { + ret = mremap_to(addr, old_len, new_addr, new_len, + &locked, flags, &uf, &uf_unmap_early, + &uf_unmap); + goto out; + } + + /* + * Always allow a shrinking remap: that just unmaps + * the unnecessary pages.. + * do_mas_munmap does all the needed commit accounting, and + * downgrades mmap_lock to read if so directed. + */ + if (old_len >= new_len) { + int retval; + MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len); + + retval = do_mas_munmap(&mas, mm, addr + new_len, + old_len - new_len, &uf_unmap, true); + /* Returning 1 indicates mmap_lock is downgraded to read. */ + if (retval == 1) { + downgraded = true; + } else if (retval < 0 && old_len != new_len) { + ret = retval; + goto out; + } + + ret = addr; + goto out; + } + + /* + * Ok, we need to grow.. + */ + vma = vma_to_resize(addr, old_len, new_len, flags); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; + } + + /* old_len exactly to the end of the area.. + */ + if (old_len == vma->vm_end - addr) { + /* can we just expand the current mapping? 
*/ + if (vma_expandable(vma, new_len - old_len)) { + long pages = (new_len - old_len) >> PAGE_SHIFT; + unsigned long extension_start = addr + old_len; + unsigned long extension_end = addr + new_len; + pgoff_t extension_pgoff = vma->vm_pgoff + + ((extension_start - vma->vm_start) >> PAGE_SHIFT); + + if (vma->vm_flags & VM_ACCOUNT) { + if (security_vm_enough_memory_mm(mm, pages)) { + ret = -ENOMEM; + goto out; + } + } + + /* + * Function vma_merge() is called on the extension we + * are adding to the already existing vma, vma_merge() + * will merge this extension with the already existing + * vma (expand operation itself) and possibly also with + * the next vma if it becomes adjacent to the expanded + * vma and otherwise compatible. + * + * However, vma_merge() can currently fail due to + * is_mergeable_vma() check for vm_ops->close (see the + * comment there). Yet this should not prevent vma + * expanding, so perform a simple expand for such vma. + * Ideally the check for close op should be only done + * when a vma would be actually removed due to a merge. + */ + if (!vma->vm_ops || !vma->vm_ops->close) { + vma = vma_merge(mm, vma, extension_start, extension_end, + vma->vm_flags, vma->anon_vma, vma->vm_file, + extension_pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + } else if (vma_adjust(vma, vma->vm_start, addr + new_len, + vma->vm_pgoff, NULL)) { + vma = NULL; + } + if (!vma) { + vm_unacct_memory(pages); + ret = -ENOMEM; + goto out; + } + + vm_stat_account(mm, vma->vm_flags, pages); + if (vma->vm_flags & VM_LOCKED) { + mm->locked_vm += pages; + locked = true; + new_addr = addr; + } + ret = addr; + goto out; + } + } + + /* + * We weren't able to just expand or shrink the area, + * we need to create a new one and move it.. 
+ */ + ret = -ENOMEM; + if (flags & MREMAP_MAYMOVE) { + unsigned long map_flags = 0; + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + + new_addr = get_unmapped_area(vma->vm_file, 0, new_len, + vma->vm_pgoff + + ((addr - vma->vm_start) >> PAGE_SHIFT), + map_flags); + if (IS_ERR_VALUE(new_addr)) { + ret = new_addr; + goto out; + } + + ret = move_vma(vma, addr, old_len, new_len, new_addr, + &locked, flags, &uf, &uf_unmap); + } +out: + if (offset_in_page(ret)) + locked = false; + if (downgraded) + mmap_read_unlock(current->mm); + else + mmap_write_unlock(current->mm); + if (locked && new_len > old_len) + mm_populate(new_addr + old_len, new_len - old_len); + userfaultfd_unmap_complete(mm, &uf_unmap_early); + mremap_userfaultfd_complete(&uf, addr, ret, old_len); + userfaultfd_unmap_complete(mm, &uf_unmap); + return ret; +} diff --git a/mm/msync.c b/mm/msync.c new file mode 100644 index 000000000..ac4c9bfea --- /dev/null +++ b/mm/msync.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/mm/msync.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * The msync() system call. + */ +#include +#include +#include +#include +#include +#include + +/* + * MS_SYNC syncs the entire file - including mappings. + * + * MS_ASYNC does not start I/O (it used to, up to 2.5.67). + * Nor does it marks the relevant pages dirty (it used to up to 2.6.17). + * Now it doesn't do anything, since dirty pages are properly tracked. + * + * The application may now run fsync() to + * write out the dirty pages and wait on the writeout and check the result. + * Or the application may run fadvise(FADV_DONTNEED) against the fd to start + * async writeout immediately. + * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to + * applications. 
/*
 * msync() - synchronise a mapped range with its backing file.
 * @start: page-aligned start of the range
 * @len:   length of the range; rounded up to a whole number of pages
 * @flags: MS_ASYNC, MS_INVALIDATE and/or MS_SYNC (MS_ASYNC and MS_SYNC are
 *         mutually exclusive)
 *
 * Only MS_SYNC on a shared, file-backed vma does any I/O here: the matching
 * file range is passed to vfs_fsync_range() with mmap_lock dropped.
 *
 * Return: 0 on success; -EINVAL for bad flags or an unaligned @start;
 * -ENOMEM if the range wraps or touches unmapped addresses; -EBUSY if
 * MS_INVALIDATE hits a VM_LOCKED vma; or the error from vfs_fsync_range().
 */
SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
{
	unsigned long end;
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	int unmapped_error = 0;
	int error = -EINVAL;

	start = untagged_addr(start);

	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	if (offset_in_page(start))
		goto out;
	if ((flags & MS_ASYNC) && (flags & MS_SYNC))
		goto out;
	error = -ENOMEM;
	/* Round len up to a multiple of the page size. */
	len = (len + ~PAGE_MASK) & PAGE_MASK;
	end = start + len;
	/* start + len wrapped around the address space. */
	if (end < start)
		goto out;
	error = 0;
	/* Empty range: nothing to do. */
	if (end == start)
		goto out;
	/*
	 * If the interval [start,end) covers some unmapped address ranges,
	 * just ignore them, but return -ENOMEM at the end. Besides, if the
	 * flag is MS_ASYNC (w/o MS_INVALIDATE) the result would be -ENOMEM
	 * anyway and there is nothing left to do, so return immediately.
	 */
	mmap_read_lock(mm);
	vma = find_vma(mm, start);
	for (;;) {
		struct file *file;
		loff_t fstart, fend;

		/* Still start < end. */
		error = -ENOMEM;
		if (!vma)
			goto out_unlock;
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			if (flags == MS_ASYNC)
				goto out_unlock;
			/* Skip the hole, but remember that there was one. */
			start = vma->vm_start;
			if (start >= end)
				goto out_unlock;
			unmapped_error = -ENOMEM;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if ((flags & MS_INVALIDATE) &&
				(vma->vm_flags & VM_LOCKED)) {
			error = -EBUSY;
			goto out_unlock;
		}
		file = vma->vm_file;
		/* Inclusive file byte range backing this part of the vma. */
		fstart = (start - vma->vm_start) +
			 ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
		fend = fstart + (min(end, vma->vm_end) - start) - 1;
		/* Advance past this vma before the lock may be dropped. */
		start = vma->vm_end;
		if ((flags & MS_SYNC) && file &&
				(vma->vm_flags & VM_SHARED)) {
			/*
			 * Hold a file reference so the file can still be used
			 * after mmap_lock is released for the fsync.
			 */
			get_file(file);
			mmap_read_unlock(mm);
			error = vfs_fsync_range(file, fstart, fend, 1);
			fput(file);
			/* The lock is not held here, hence "out" not "out_unlock". */
			if (error || start >= end)
				goto out;
			mmap_read_lock(mm);
			/* vma was not protected while unlocked: look up again. */
			vma = find_vma(mm, start);
		} else {
			if (start >= end) {
				error = 0;
				goto out_unlock;
			}
			vma = find_vma(mm, vma->vm_end);
		}
	}
out_unlock:
	mmap_read_unlock(mm);
out:
	/* A real error wins; otherwise report any hole that was skipped. */
	return error ? : unmapped_error;
}
+ * + * See Documentation/admin-guide/mm/nommu-mmap.rst + * + * Copyright (c) 2004-2008 David Howells + * Copyright (c) 2000-2003 David McCullough + * Copyright (c) 2000-2001 D Jeff Dionne + * Copyright (c) 2002 Greg Ungerer + * Copyright (c) 2007-2010 Paul Mundt + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "internal.h" + +void *high_memory; +EXPORT_SYMBOL(high_memory); +struct page *mem_map; +unsigned long max_mapnr; +EXPORT_SYMBOL(max_mapnr); +unsigned long highest_memmap_pfn; +int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; +int heap_stack_gap = 0; + +atomic_long_t mmap_pages_allocated; + +EXPORT_SYMBOL(mem_map); + +/* list of mapped, potentially shareable regions */ +static struct kmem_cache *vm_region_jar; +struct rb_root nommu_region_tree = RB_ROOT; +DECLARE_RWSEM(nommu_region_sem); + +const struct vm_operations_struct generic_file_vm_ops = { +}; + +/* + * Return the total memory allocated for this pointer, not + * just what the caller asked for. + * + * Doesn't have to be accurate, i.e. may have races. + */ +unsigned int kobjsize(const void *objp) +{ + struct page *page; + + /* + * If the object we have should not have ksize performed on it, + * return size of 0 + */ + if (!objp || !virt_addr_valid(objp)) + return 0; + + page = virt_to_head_page(objp); + + /* + * If the allocator sets PageSlab, we know the pointer came from + * kmalloc(). + */ + if (PageSlab(page)) + return ksize(objp); + + /* + * If it's not a compound page, see if we have a matching VMA + * region. This test is intentionally done in reverse order, + * so if there's no VMA, we still fall through and hand back + * PAGE_SIZE for 0-order pages. 
+ */ + if (!PageCompound(page)) { + struct vm_area_struct *vma; + + vma = find_vma(current->mm, (unsigned long)objp); + if (vma) + return vma->vm_end - vma->vm_start; + } + + /* + * The ksize() function is only guaranteed to work for pointers + * returned by kmalloc(). So handle arbitrary pointers here. + */ + return page_size(page); +} + +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + *pfn = address >> PAGE_SHIFT; + return 0; +} +EXPORT_SYMBOL(follow_pfn); + +LIST_HEAD(vmap_area_list); + +void vfree(const void *addr) +{ + kfree(addr); +} +EXPORT_SYMBOL(vfree); + +void *__vmalloc(unsigned long size, gfp_t gfp_mask) +{ + /* + * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() + * returns only a logical address. 
+ */ + return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); +} +EXPORT_SYMBOL(__vmalloc); + +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) +{ + return __vmalloc(size, gfp_mask); +} + +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, + int node, const void *caller) +{ + return __vmalloc(size, gfp_mask); +} + +static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) +{ + void *ret; + + ret = __vmalloc(size, flags); + if (ret) { + struct vm_area_struct *vma; + + mmap_write_lock(current->mm); + vma = find_vma(current->mm, (unsigned long)ret); + if (vma) + vma->vm_flags |= VM_USERMAP; + mmap_write_unlock(current->mm); + } + + return ret; +} + +void *vmalloc_user(unsigned long size) +{ + return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO); +} +EXPORT_SYMBOL(vmalloc_user); + +struct page *vmalloc_to_page(const void *addr) +{ + return virt_to_page(addr); +} +EXPORT_SYMBOL(vmalloc_to_page); + +unsigned long vmalloc_to_pfn(const void *addr) +{ + return page_to_pfn(virt_to_page(addr)); +} +EXPORT_SYMBOL(vmalloc_to_pfn); + +long vread(char *buf, char *addr, unsigned long count) +{ + /* Don't allow overflow */ + if ((unsigned long) buf + count < count) + count = -(unsigned long) buf; + + memcpy(buf, addr, count); + return count; +} + +/* + * vmalloc - allocate virtually contiguous memory + * + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. 
+ */ +void *vmalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL); +} +EXPORT_SYMBOL(vmalloc); + +void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc); + +/* + * vzalloc - allocate virtually contiguous memory with zero fill + * + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc); + +/** + * vmalloc_node - allocate memory on a specific node + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vmalloc_node(unsigned long size, int node) +{ + return vmalloc(size); +} +EXPORT_SYMBOL(vmalloc_node); + +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc_node(unsigned long size, int node) +{ + return vzalloc(size); +} +EXPORT_SYMBOL(vzalloc_node); + +/** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size + * + * Allocate enough 32bit PA addressable pages to cover @size from the + * page level allocator and map them into contiguous kernel virtual space. 
+ */ +void *vmalloc_32(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL); +} +EXPORT_SYMBOL(vmalloc_32); + +/** + * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory + * @size: allocation size + * + * The resulting memory area is 32bit addressable and zeroed so it can be + * mapped to userspace without leaking data. + * + * VM_USERMAP is set on the corresponding VMA so that subsequent calls to + * remap_vmalloc_range() are permissible. + */ +void *vmalloc_32_user(unsigned long size) +{ + /* + * We'll have to sort out the ZONE_DMA bits for 64-bit, + * but for now this can simply use vmalloc_user() directly. + */ + return vmalloc_user(size); +} +EXPORT_SYMBOL(vmalloc_32_user); + +void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL(vmap); + +void vunmap(const void *addr) +{ + BUG(); +} +EXPORT_SYMBOL(vunmap); + +void *vm_map_ram(struct page **pages, unsigned int count, int node) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL(vm_map_ram); + +void vm_unmap_ram(const void *mem, unsigned int count) +{ + BUG(); +} +EXPORT_SYMBOL(vm_unmap_ram); + +void vm_unmap_aliases(void) +{ +} +EXPORT_SYMBOL_GPL(vm_unmap_aliases); + +void free_vm_area(struct vm_struct *area) +{ + BUG(); +} +EXPORT_SYMBOL_GPL(free_vm_area); + +int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_page); + +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_map_pages); + +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_map_pages_zero); + +/* + * sys_brk() for the most part doesn't need the global kernel + * lock, except when an application is doing something nasty + * like trying to un-brk an area that has already been mapped + * to a regular file. 
in this case, the unmapping will need + * to invoke file system routines that need the global lock. + */ +SYSCALL_DEFINE1(brk, unsigned long, brk) +{ + struct mm_struct *mm = current->mm; + + if (brk < mm->start_brk || brk > mm->context.end_brk) + return mm->brk; + + if (mm->brk == brk) + return mm->brk; + + /* + * Always allow shrinking brk + */ + if (brk <= mm->brk) { + mm->brk = brk; + return brk; + } + + /* + * Ok, looks good - let it rip. + */ + flush_icache_user_range(mm->brk, brk); + return mm->brk = brk; +} + +/* + * initialise the percpu counter for VM and region record slabs + */ +void __init mmap_init(void) +{ + int ret; + + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); + VM_BUG_ON(ret); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); +} + +/* + * validate the region tree + * - the caller must hold the region lock + */ +#ifdef CONFIG_DEBUG_NOMMU_REGIONS +static noinline void validate_nommu_regions(void) +{ + struct vm_region *region, *last; + struct rb_node *p, *lastp; + + lastp = rb_first(&nommu_region_tree); + if (!lastp) + return; + + last = rb_entry(lastp, struct vm_region, vm_rb); + BUG_ON(last->vm_end <= last->vm_start); + BUG_ON(last->vm_top < last->vm_end); + + while ((p = rb_next(lastp))) { + region = rb_entry(p, struct vm_region, vm_rb); + last = rb_entry(lastp, struct vm_region, vm_rb); + + BUG_ON(region->vm_end <= region->vm_start); + BUG_ON(region->vm_top < region->vm_end); + BUG_ON(region->vm_start < last->vm_top); + + lastp = p; + } +} +#else +static void validate_nommu_regions(void) +{ +} +#endif + +/* + * add a region into the global tree + */ +static void add_nommu_region(struct vm_region *region) +{ + struct vm_region *pregion; + struct rb_node **p, *parent; + + validate_nommu_regions(); + + parent = NULL; + p = &nommu_region_tree.rb_node; + while (*p) { + parent = *p; + pregion = rb_entry(parent, struct vm_region, vm_rb); + if (region->vm_start < pregion->vm_start) + p = &(*p)->rb_left; + else if 
(region->vm_start > pregion->vm_start) + p = &(*p)->rb_right; + else if (pregion == region) + return; + else + BUG(); + } + + rb_link_node(®ion->vm_rb, parent, p); + rb_insert_color(®ion->vm_rb, &nommu_region_tree); + + validate_nommu_regions(); +} + +/* + * delete a region from the global tree + */ +static void delete_nommu_region(struct vm_region *region) +{ + BUG_ON(!nommu_region_tree.rb_node); + + validate_nommu_regions(); + rb_erase(®ion->vm_rb, &nommu_region_tree); + validate_nommu_regions(); +} + +/* + * free a contiguous series of pages + */ +static void free_page_series(unsigned long from, unsigned long to) +{ + for (; from < to; from += PAGE_SIZE) { + struct page *page = virt_to_page((void *)from); + + atomic_long_dec(&mmap_pages_allocated); + put_page(page); + } +} + +/* + * release a reference to a region + * - the caller must hold the region semaphore for writing, which this releases + * - the region may not have been added to the tree yet, in which case vm_top + * will equal vm_start + */ +static void __put_nommu_region(struct vm_region *region) + __releases(nommu_region_sem) +{ + BUG_ON(!nommu_region_tree.rb_node); + + if (--region->vm_usage == 0) { + if (region->vm_top > region->vm_start) + delete_nommu_region(region); + up_write(&nommu_region_sem); + + if (region->vm_file) + fput(region->vm_file); + + /* IO memory and memory shared directly out of the pagecache + * from ramfs/tmpfs mustn't be released here */ + if (region->vm_flags & VM_MAPPED_COPY) + free_page_series(region->vm_start, region->vm_top); + kmem_cache_free(vm_region_jar, region); + } else { + up_write(&nommu_region_sem); + } +} + +/* + * release a reference to a region + */ +static void put_nommu_region(struct vm_region *region) +{ + down_write(&nommu_region_sem); + __put_nommu_region(region); +} + +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) +{ + mas_set_range(mas, vma->vm_start, vma->vm_end - 1); + mas_store_prealloc(mas, vma); +} + +void 
vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) +{ + mas->index = vma->vm_start; + mas->last = vma->vm_end - 1; + mas_store_prealloc(mas, NULL); +} + +static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) +{ + vma->vm_mm = mm; + + /* add the VMA to the mapping */ + if (vma->vm_file) { + struct address_space *mapping = vma->vm_file->f_mapping; + + i_mmap_lock_write(mapping); + flush_dcache_mmap_lock(mapping); + vma_interval_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } +} + +/* + * mas_add_vma_to_mm() - Maple state variant of add_mas_to_mm(). + * @mas: The maple state with preallocations. + * @mm: The mm_struct + * @vma: The vma to add + * + */ +static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, + struct vm_area_struct *vma) +{ + BUG_ON(!vma->vm_region); + + setup_vma_to_mm(vma, mm); + mm->map_count++; + + /* add the VMA to the tree */ + vma_mas_store(vma, mas); +} + +/* + * add a VMA into a process's mm_struct in the appropriate place in the list + * and tree and add to the address space's page tree also if not an anonymous + * page + * - should be called with mm->mmap_lock held writelocked + */ +static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) +{ + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; + } + mas_add_vma_to_mm(&mas, mm, vma); + return 0; +} + +static void cleanup_vma_from_mm(struct vm_area_struct *vma) +{ + vma->vm_mm->map_count--; + /* remove the VMA from the mapping */ + if (vma->vm_file) { + struct address_space *mapping; + mapping = vma->vm_file->f_mapping; + + i_mmap_lock_write(mapping); + flush_dcache_mmap_lock(mapping); + vma_interval_tree_remove(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } +} +/* 
+ * delete a VMA from its owning mm_struct and address space + */ +static int delete_vma_from_mm(struct vm_area_struct *vma) +{ + MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; + } + cleanup_vma_from_mm(vma); + + /* remove from the MM's tree and list */ + vma_mas_remove(vma, &mas); + return 0; +} + +/* + * destroy a VMA record + */ +static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) +{ + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + if (vma->vm_file) + fput(vma->vm_file); + put_nommu_region(vma->vm_region); + vm_area_free(vma); +} + +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + unsigned long index = start_addr; + + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} +EXPORT_SYMBOL(find_vma_intersection); + +/* + * look up the first VMA in which addr resides, NULL if none + * - should be called with mm->mmap_lock at least held readlocked + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + MA_STATE(mas, &mm->mm_mt, addr, addr); + + return mas_walk(&mas); +} +EXPORT_SYMBOL(find_vma); + +/* + * At least xtensa ends up having protection faults even with no + * MMU.. No stack expansion, at least. 
+ */ +struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, struct pt_regs *regs) +{ + struct vm_area_struct *vma; + + mmap_read_lock(mm); + vma = vma_lookup(mm, addr); + if (!vma) + mmap_read_unlock(mm); + return vma; +} + +/* + * expand a stack to a given address + * - not supported under NOMMU conditions + */ +int expand_stack_locked(struct vm_area_struct *vma, unsigned long addr) +{ + return -ENOMEM; +} + +struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr) +{ + mmap_read_unlock(mm); + return NULL; +} + +/* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_lock at least held readlocked + */ +static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, + unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + unsigned long end = addr + len; + MA_STATE(mas, &mm->mm_mt, addr, addr); + + vma = mas_walk(&mas); + if (!vma) + return NULL; + if (vma->vm_start != addr) + return NULL; + if (vma->vm_end != end) + return NULL; + + return vma; +} + +/* + * determine whether a mapping should be permitted and, if so, what sort of + * mapping we're capable of supporting + */ +static int validate_mmap_request(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long prot, + unsigned long flags, + unsigned long pgoff, + unsigned long *_capabilities) +{ + unsigned long capabilities, rlen; + int ret; + + /* do the simple checks first */ + if (flags & MAP_FIXED) + return -EINVAL; + + if ((flags & MAP_TYPE) != MAP_PRIVATE && + (flags & MAP_TYPE) != MAP_SHARED) + return -EINVAL; + + if (!len) + return -EINVAL; + + /* Careful about overflows.. */ + rlen = PAGE_ALIGN(len); + if (!rlen || rlen > TASK_SIZE) + return -ENOMEM; + + /* offset overflow? 
*/ + if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff) + return -EOVERFLOW; + + if (file) { + /* files must support mmap */ + if (!file->f_op->mmap) + return -ENODEV; + + /* work out if what we've got could possibly be shared + * - we support chardevs that provide their own "memory" + * - we support files/blockdevs that are memory backed + */ + if (file->f_op->mmap_capabilities) { + capabilities = file->f_op->mmap_capabilities(file); + } else { + /* no explicit capabilities set, so assume some + * defaults */ + switch (file_inode(file)->i_mode & S_IFMT) { + case S_IFREG: + case S_IFBLK: + capabilities = NOMMU_MAP_COPY; + break; + + case S_IFCHR: + capabilities = + NOMMU_MAP_DIRECT | + NOMMU_MAP_READ | + NOMMU_MAP_WRITE; + break; + + default: + return -EINVAL; + } + } + + /* eliminate any capabilities that we can't support on this + * device */ + if (!file->f_op->get_unmapped_area) + capabilities &= ~NOMMU_MAP_DIRECT; + if (!(file->f_mode & FMODE_CAN_READ)) + capabilities &= ~NOMMU_MAP_COPY; + + /* The file shall have been opened with read permission. 
*/ + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + + if (flags & MAP_SHARED) { + /* do checks for writing, appending and locking */ + if ((prot & PROT_WRITE) && + !(file->f_mode & FMODE_WRITE)) + return -EACCES; + + if (IS_APPEND(file_inode(file)) && + (file->f_mode & FMODE_WRITE)) + return -EACCES; + + if (!(capabilities & NOMMU_MAP_DIRECT)) + return -ENODEV; + + /* we mustn't privatise shared mappings */ + capabilities &= ~NOMMU_MAP_COPY; + } else { + /* we're going to read the file into private memory we + * allocate */ + if (!(capabilities & NOMMU_MAP_COPY)) + return -ENODEV; + + /* we don't permit a private writable mapping to be + * shared with the backing device */ + if (prot & PROT_WRITE) + capabilities &= ~NOMMU_MAP_DIRECT; + } + + if (capabilities & NOMMU_MAP_DIRECT) { + if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) || + ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) || + ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC)) + ) { + capabilities &= ~NOMMU_MAP_DIRECT; + if (flags & MAP_SHARED) { + pr_warn("MAP_SHARED not completely supported on !MMU\n"); + return -EINVAL; + } + } + } + + /* handle executable mappings and implied executable + * mappings */ + if (path_noexec(&file->f_path)) { + if (prot & PROT_EXEC) + return -EPERM; + } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { + /* handle implication of PROT_EXEC by PROT_READ */ + if (current->personality & READ_IMPLIES_EXEC) { + if (capabilities & NOMMU_MAP_EXEC) + prot |= PROT_EXEC; + } + } else if ((prot & PROT_READ) && + (prot & PROT_EXEC) && + !(capabilities & NOMMU_MAP_EXEC) + ) { + /* backing file is not executable, try to copy */ + capabilities &= ~NOMMU_MAP_DIRECT; + } + } else { + /* anonymous mappings are always memory backed and can be + * privately mapped + */ + capabilities = NOMMU_MAP_COPY; + + /* handle PROT_EXEC implication by PROT_READ */ + if ((prot & PROT_READ) && + (current->personality & READ_IMPLIES_EXEC)) + prot |= PROT_EXEC; + } + + 
/* allow the security API to have its say */ + ret = security_mmap_addr(addr); + if (ret < 0) + return ret; + + /* looks okay */ + *_capabilities = capabilities; + return 0; +} + +/* + * we've determined that we can make the mapping, now translate what we + * now know into VMA flags + */ +static unsigned long determine_vm_flags(struct file *file, + unsigned long prot, + unsigned long flags, + unsigned long capabilities) +{ + unsigned long vm_flags; + + vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags); + /* vm_flags |= mm->def_flags; */ + + if (!(capabilities & NOMMU_MAP_DIRECT)) { + /* attempt to share read-only copies of mapped file chunks */ + vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + if (file && !(prot & PROT_WRITE)) + vm_flags |= VM_MAYSHARE; + } else { + /* overlay a shareable mapping on the backing device or inode + * if possible - used for chardevs, ramfs/tmpfs/shmfs and + * romfs/cramfs */ + vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS); + if (flags & MAP_SHARED) + vm_flags |= VM_SHARED; + } + + /* refuse to let anyone share private mappings with this process if + * it's being traced - otherwise breakpoints set in it may interfere + * with another untraced process + */ + if ((flags & MAP_PRIVATE) && current->ptrace) + vm_flags &= ~VM_MAYSHARE; + + return vm_flags; +} + +/* + * set up a shared mapping on a file (the driver or filesystem provides and + * pins the storage) + */ +static int do_mmap_shared_file(struct vm_area_struct *vma) +{ + int ret; + + ret = call_mmap(vma->vm_file, vma); + if (ret == 0) { + vma->vm_region->vm_top = vma->vm_region->vm_end; + return 0; + } + if (ret != -ENOSYS) + return ret; + + /* getting -ENOSYS indicates that direct mmap isn't possible (as + * opposed to tried but failed) so we can only give a suitable error as + * it's not possible to make a private copy if MAP_SHARED was given */ + return -ENODEV; +} + +/* + * set up a private mapping or an anonymous shared mapping + */ +static int 
do_mmap_private(struct vm_area_struct *vma, + struct vm_region *region, + unsigned long len, + unsigned long capabilities) +{ + unsigned long total, point; + void *base; + int ret, order; + + /* invoke the file's mapping function so that it can keep track of + * shared mappings on devices or memory + * - VM_MAYSHARE will be set if it may attempt to share + */ + if (capabilities & NOMMU_MAP_DIRECT) { + ret = call_mmap(vma->vm_file, vma); + if (ret == 0) { + /* shouldn't return success if we're not sharing */ + BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); + vma->vm_region->vm_top = vma->vm_region->vm_end; + return 0; + } + if (ret != -ENOSYS) + return ret; + + /* getting an ENOSYS error indicates that direct mmap isn't + * possible (as opposed to tried but failed) so we'll try to + * make a private copy of the data and map that instead */ + } + + + /* allocate some memory to hold the mapping + * - note that this may not return a page-aligned address if the object + * we're allocating is smaller than a page + */ + order = get_order(len); + total = 1 << order; + point = len >> PAGE_SHIFT; + + /* we don't want to allocate a power-of-2 sized page set */ + if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) + total = point; + + base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL); + if (!base) + goto enomem; + + atomic_long_add(total, &mmap_pages_allocated); + + region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; + region->vm_start = (unsigned long) base; + region->vm_end = region->vm_start + len; + region->vm_top = region->vm_start + (total << PAGE_SHIFT); + + vma->vm_start = region->vm_start; + vma->vm_end = region->vm_start + len; + + if (vma->vm_file) { + /* read the contents of a file into the copy */ + loff_t fpos; + + fpos = vma->vm_pgoff; + fpos <<= PAGE_SHIFT; + + ret = kernel_read(vma->vm_file, base, len, &fpos); + if (ret < 0) + goto error_free; + + /* clear the last little bit */ + if (ret < len) + memset(base + ret, 0, len - ret); + + } else { 
+ vma_set_anonymous(vma); + } + + return 0; + +error_free: + free_page_series(region->vm_start, region->vm_top); + region->vm_start = vma->vm_start = 0; + region->vm_end = vma->vm_end = 0; + region->vm_top = 0; + return ret; + +enomem: + pr_err("Allocation of length %lu from process %d (%s) failed\n", + len, current->pid, current->comm); + show_free_areas(0, NULL); + return -ENOMEM; +} + +/* + * handle mapping creation for uClinux + */ +unsigned long do_mmap(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long prot, + unsigned long flags, + unsigned long pgoff, + unsigned long *populate, + struct list_head *uf) +{ + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *rb; + vm_flags_t vm_flags; + unsigned long capabilities, result; + int ret; + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); + + *populate = 0; + + /* decide whether we should attempt the mapping, and if so what sort of + * mapping */ + ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, + &capabilities); + if (ret < 0) + return ret; + + /* we ignore the address hint */ + addr = 0; + len = PAGE_ALIGN(len); + + /* we've determined that we can make the mapping, now translate what we + * now know into VMA flags */ + vm_flags = determine_vm_flags(file, prot, flags, capabilities); + + + /* we're going to need to record the mapping */ + region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); + if (!region) + goto error_getting_region; + + vma = vm_area_alloc(current->mm); + if (!vma) + goto error_getting_vma; + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + goto error_maple_preallocate; + + region->vm_usage = 1; + region->vm_flags = vm_flags; + region->vm_pgoff = pgoff; + + vma->vm_flags = vm_flags; + vma->vm_pgoff = pgoff; + + if (file) { + region->vm_file = get_file(file); + vma->vm_file = get_file(file); + } + + down_write(&nommu_region_sem); + + /* if we want to share, we need to check for regions created by other + * mmap() calls that overlap with our 
proposed mapping + * - we can only share with a superset match on most regular files + * - shared mappings on character devices and memory backed files are + * permitted to overlap inexactly as far as we are concerned for in + * these cases, sharing is handled in the driver or filesystem rather + * than here + */ + if (vm_flags & VM_MAYSHARE) { + struct vm_region *pregion; + unsigned long pglen, rpglen, pgend, rpgend, start; + + pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgend = pgoff + pglen; + + for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { + pregion = rb_entry(rb, struct vm_region, vm_rb); + + if (!(pregion->vm_flags & VM_MAYSHARE)) + continue; + + /* search for overlapping mappings on the same file */ + if (file_inode(pregion->vm_file) != + file_inode(file)) + continue; + + if (pregion->vm_pgoff >= pgend) + continue; + + rpglen = pregion->vm_end - pregion->vm_start; + rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT; + rpgend = pregion->vm_pgoff + rpglen; + if (pgoff >= rpgend) + continue; + + /* handle inexactly overlapping matches between + * mappings */ + if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && + !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { + /* new mapping is not a subset of the region */ + if (!(capabilities & NOMMU_MAP_DIRECT)) + goto sharing_violation; + continue; + } + + /* we've found a region we can share */ + pregion->vm_usage++; + vma->vm_region = pregion; + start = pregion->vm_start; + start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; + vma->vm_start = start; + vma->vm_end = start + len; + + if (pregion->vm_flags & VM_MAPPED_COPY) + vma->vm_flags |= VM_MAPPED_COPY; + else { + ret = do_mmap_shared_file(vma); + if (ret < 0) { + vma->vm_region = NULL; + vma->vm_start = 0; + vma->vm_end = 0; + pregion->vm_usage--; + pregion = NULL; + goto error_just_free; + } + } + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + region = pregion; + result = start; + goto share; + } + + /* obtain the 
address at which to make a shared mapping + * - this is the hook for quasi-memory character devices to + * tell us the location of a shared mapping + */ + if (capabilities & NOMMU_MAP_DIRECT) { + addr = file->f_op->get_unmapped_area(file, addr, len, + pgoff, flags); + if (IS_ERR_VALUE(addr)) { + ret = addr; + if (ret != -ENOSYS) + goto error_just_free; + + /* the driver refused to tell us where to site + * the mapping so we'll have to attempt to copy + * it */ + ret = -ENODEV; + if (!(capabilities & NOMMU_MAP_COPY)) + goto error_just_free; + + capabilities &= ~NOMMU_MAP_DIRECT; + } else { + vma->vm_start = region->vm_start = addr; + vma->vm_end = region->vm_end = addr + len; + } + } + } + + vma->vm_region = region; + + /* set up the mapping + * - the region is filled in if NOMMU_MAP_DIRECT is still set + */ + if (file && vma->vm_flags & VM_SHARED) + ret = do_mmap_shared_file(vma); + else + ret = do_mmap_private(vma, region, len, capabilities); + if (ret < 0) + goto error_just_free; + add_nommu_region(region); + + /* clear anonymous mappings that don't ask for uninitialized data */ + if (!vma->vm_file && + (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) || + !(flags & MAP_UNINITIALIZED))) + memset((void *)region->vm_start, 0, + region->vm_end - region->vm_start); + + /* okay... 
we have a mapping; now we have to register it */ + result = vma->vm_start; + + current->mm->total_vm += len >> PAGE_SHIFT; + +share: + mas_add_vma_to_mm(&mas, current->mm, vma); + + /* we flush the region from the icache only when the first executable + * mapping of it is made */ + if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { + flush_icache_user_range(region->vm_start, region->vm_end); + region->vm_icache_flushed = true; + } + + up_write(&nommu_region_sem); + + return result; + +error_just_free: + up_write(&nommu_region_sem); +error: + mas_destroy(&mas); + if (region->vm_file) + fput(region->vm_file); + kmem_cache_free(vm_region_jar, region); + if (vma->vm_file) + fput(vma->vm_file); + vm_area_free(vma); + return ret; + +sharing_violation: + up_write(&nommu_region_sem); + pr_warn("Attempt to share mismatched mappings\n"); + ret = -EINVAL; + goto error; + +error_getting_vma: + kmem_cache_free(vm_region_jar, region); + pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", + len, current->pid); + show_free_areas(0, NULL); + return -ENOMEM; + +error_getting_region: + pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", + len, current->pid); + show_free_areas(0, NULL); + return -ENOMEM; + +error_maple_preallocate: + kmem_cache_free(vm_region_jar, region); + vm_area_free(vma); + pr_warn("Allocation of vma tree for process %d failed\n", current->pid); + show_free_areas(0, NULL); + return -ENOMEM; + +} + +unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + audit_mmap_fd(fd, flags); + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) + goto out; + } + + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); + + if (file) + fput(file); +out: + return retval; +} + +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, 
unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); +} + +#ifdef __ARCH_WANT_SYS_OLD_MMAP +struct mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + unsigned long flags; + unsigned long fd; + unsigned long offset; +}; + +SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) +{ + struct mmap_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + if (offset_in_page(a.offset)) + return -EINVAL; + + return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); +} +#endif /* __ARCH_WANT_SYS_OLD_MMAP */ + +/* + * split a vma into two pieces at address 'addr', a new vma is allocated either + * for the first part or the tail. + */ +int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) +{ + struct vm_area_struct *new; + struct vm_region *region; + unsigned long npages; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); + + /* we're only permitted to split anonymous regions (these should have + * only a single usage on the region) */ + if (vma->vm_file) + return -ENOMEM; + + mm = vma->vm_mm; + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + + region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); + if (!region) + return -ENOMEM; + + new = vm_area_dup(vma); + if (!new) + goto err_vma_dup; + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + goto err_mas_preallocate; + } + + /* most fields are the same, copy all, and then fixup */ + *region = *vma->vm_region; + new->vm_region = region; + + npages = (addr - vma->vm_start) >> PAGE_SHIFT; + + if (new_below) { + region->vm_top = region->vm_end = new->vm_end = addr; + } else { + region->vm_start = new->vm_start = addr; + region->vm_pgoff = new->vm_pgoff += npages; + } + + if 
(new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + + down_write(&nommu_region_sem); + delete_nommu_region(vma->vm_region); + if (new_below) { + vma->vm_region->vm_start = vma->vm_start = addr; + vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; + } else { + vma->vm_region->vm_end = vma->vm_end = addr; + vma->vm_region->vm_top = addr; + } + add_nommu_region(vma->vm_region); + add_nommu_region(new->vm_region); + up_write(&nommu_region_sem); + + setup_vma_to_mm(vma, mm); + setup_vma_to_mm(new, mm); + mas_set_range(&mas, vma->vm_start, vma->vm_end - 1); + mas_store(&mas, vma); + vma_mas_store(new, &mas); + mm->map_count++; + return 0; + +err_mas_preallocate: + vm_area_free(new); +err_vma_dup: + kmem_cache_free(vm_region_jar, region); + return -ENOMEM; +} + +/* + * shrink a VMA by removing the specified chunk from either the beginning or + * the end + */ +static int shrink_vma(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long from, unsigned long to) +{ + struct vm_region *region; + + /* adjust the VMA's pointers, which may reposition it in the MM's tree + * and list */ + if (delete_vma_from_mm(vma)) + return -ENOMEM; + if (from > vma->vm_start) + vma->vm_end = from; + else + vma->vm_start = to; + if (add_vma_to_mm(mm, vma)) + return -ENOMEM; + + /* cut the backing region down to size */ + region = vma->vm_region; + BUG_ON(region->vm_usage != 1); + + down_write(&nommu_region_sem); + delete_nommu_region(region); + if (from > region->vm_start) { + to = region->vm_top; + region->vm_top = region->vm_end = from; + } else { + region->vm_start = to; + } + add_nommu_region(region); + up_write(&nommu_region_sem); + + free_page_series(from, to); + return 0; +} + +/* + * release a mapping + * - under NOMMU conditions the chunk to be unmapped must be backed by a single + * VMA, though it need not cover the whole VMA + */ +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) +{ + MA_STATE(mas, &mm->mm_mt, start, 
start); + struct vm_area_struct *vma; + unsigned long end; + int ret = 0; + + len = PAGE_ALIGN(len); + if (len == 0) + return -EINVAL; + + end = start + len; + + /* find the first potentially overlapping VMA */ + vma = mas_find(&mas, end - 1); + if (!vma) { + static int limit; + if (limit < 5) { + pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n", + current->pid, current->comm, + start, start + len - 1); + limit++; + } + return -EINVAL; + } + + /* we're allowed to split an anonymous VMA but not a file-backed one */ + if (vma->vm_file) { + do { + if (start > vma->vm_start) + return -EINVAL; + if (end == vma->vm_end) + goto erase_whole_vma; + vma = mas_next(&mas, end - 1); + } while (vma); + return -EINVAL; + } else { + /* the chunk must be a subset of the VMA found */ + if (start == vma->vm_start && end == vma->vm_end) + goto erase_whole_vma; + if (start < vma->vm_start || end > vma->vm_end) + return -EINVAL; + if (offset_in_page(start)) + return -EINVAL; + if (end != vma->vm_end && offset_in_page(end)) + return -EINVAL; + if (start != vma->vm_start && end != vma->vm_end) { + ret = split_vma(mm, vma, start, 1); + if (ret < 0) + return ret; + } + return shrink_vma(mm, vma, start, end); + } + +erase_whole_vma: + if (delete_vma_from_mm(vma)) + ret = -ENOMEM; + else + delete_vma(mm, vma); + return ret; +} + +int vm_munmap(unsigned long addr, size_t len) +{ + struct mm_struct *mm = current->mm; + int ret; + + mmap_write_lock(mm); + ret = do_munmap(mm, addr, len, NULL); + mmap_write_unlock(mm); + return ret; +} +EXPORT_SYMBOL(vm_munmap); + +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) +{ + return vm_munmap(addr, len); +} + +/* + * release all the mappings made in a process's VM space + */ +void exit_mmap(struct mm_struct *mm) +{ + VMA_ITERATOR(vmi, mm, 0); + struct vm_area_struct *vma; + + if (!mm) + return; + + mm->total_vm = 0; + + /* + * Lock the mm to avoid assert complaining even though this is the only + * user of the mm + */ + 
mmap_write_lock(mm); + for_each_vma(vmi, vma) { + cleanup_vma_from_mm(vma); + delete_vma(mm, vma); + cond_resched(); + } + __mt_destroy(&mm->mm_mt); + mmap_write_unlock(mm); +} + +int vm_brk(unsigned long addr, unsigned long len) +{ + return -ENOMEM; +} + +/* + * expand (or shrink) an existing mapping, potentially moving it at the same + * time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * + * under NOMMU conditions, we only permit changing a mapping's size, and only + * as long as it stays within the region allocated by do_mmap_private() and the + * block is not shareable + * + * MREMAP_FIXED is not supported under NOMMU conditions + */ +static unsigned long do_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) +{ + struct vm_area_struct *vma; + + /* insanity checks first */ + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + if (old_len == 0 || new_len == 0) + return (unsigned long) -EINVAL; + + if (offset_in_page(addr)) + return -EINVAL; + + if (flags & MREMAP_FIXED && new_addr != addr) + return (unsigned long) -EINVAL; + + vma = find_vma_exact(current->mm, addr, old_len); + if (!vma) + return (unsigned long) -EINVAL; + + if (vma->vm_end != vma->vm_start + old_len) + return (unsigned long) -EFAULT; + + if (vma->vm_flags & VM_MAYSHARE) + return (unsigned long) -EPERM; + + if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) + return (unsigned long) -ENOMEM; + + /* all checks complete - do it */ + vma->vm_end = vma->vm_start + new_len; + return vma->vm_start; +} + +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) +{ + unsigned long ret; + + mmap_write_lock(current->mm); + ret = do_mremap(addr, old_len, new_len, flags, new_addr); + mmap_write_unlock(current->mm); + return ret; +} + +struct page *follow_page(struct vm_area_struct *vma, unsigned long 
address, + unsigned int foll_flags) +{ + return NULL; +} + +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + if (addr != (pfn << PAGE_SHIFT)) + return -EINVAL; + + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; + return 0; +} +EXPORT_SYMBOL(remap_pfn_range); + +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) +{ + unsigned long pfn = start >> PAGE_SHIFT; + unsigned long vm_len = vma->vm_end - vma->vm_start; + + pfn += vma->vm_pgoff; + return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_iomap_memory); + +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + unsigned int size = vma->vm_end - vma->vm_start; + + if (!(vma->vm_flags & VM_USERMAP)) + return -EINVAL; + + vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT)); + vma->vm_end = vma->vm_start + size; + + return 0; +} +EXPORT_SYMBOL(remap_vmalloc_range); + +vm_fault_t filemap_fault(struct vm_fault *vmf) +{ + BUG(); + return 0; +} +EXPORT_SYMBOL(filemap_fault); + +vm_fault_t filemap_map_pages(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff) +{ + BUG(); + return 0; +} +EXPORT_SYMBOL(filemap_map_pages); + +int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, + int len, unsigned int gup_flags) +{ + struct vm_area_struct *vma; + int write = gup_flags & FOLL_WRITE; + + if (mmap_read_lock_killable(mm)) + return 0; + + /* the access must start within one of the target process's mappings */ + vma = find_vma(mm, addr); + if (vma) { + /* don't overrun this mapping */ + if (addr + len >= vma->vm_end) + len = vma->vm_end - addr; + + /* only read or write mappings where it is permitted */ + if (write && vma->vm_flags & VM_MAYWRITE) + copy_to_user_page(vma, NULL, addr, + (void *) addr, buf, len); + else if (!write && vma->vm_flags & VM_MAYREAD) + 
copy_from_user_page(vma, NULL, addr, + buf, (void *) addr, len); + else + len = 0; + } else { + len = 0; + } + + mmap_read_unlock(mm); + + return len; +} + +/** + * access_remote_vm - access another process' address space + * @mm: the mm_struct of the target address space + * @addr: start address to access + * @buf: source or destination buffer + * @len: number of bytes to transfer + * @gup_flags: flags modifying lookup behaviour + * + * The caller must hold a reference on @mm. + */ +int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) +{ + return __access_remote_vm(mm, addr, buf, len, gup_flags); +} + +/* + * Access another process' address space. + * - source/target buffer must be kernel space + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, + unsigned int gup_flags) +{ + struct mm_struct *mm; + + if (addr + len < addr) + return 0; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + len = __access_remote_vm(mm, addr, buf, len, gup_flags); + + mmput(mm); + return len; +} +EXPORT_SYMBOL_GPL(access_process_vm); + +/** + * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode + * @inode: The inode to check + * @size: The current filesize of the inode + * @newsize: The proposed filesize of the inode + * + * Check the shared mappings on an inode on behalf of a shrinking truncate to + * make sure that any outstanding VMAs aren't broken and then shrink the + * vm_regions that extend beyond so that do_mmap() doesn't + * automatically grant mappings that are too large. 
+ */ +int nommu_shrink_inode_mappings(struct inode *inode, size_t size, + size_t newsize) +{ + struct vm_area_struct *vma; + struct vm_region *region; + pgoff_t low, high; + size_t r_size, r_top; + + low = newsize >> PAGE_SHIFT; + high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + + down_write(&nommu_region_sem); + i_mmap_lock_read(inode->i_mapping); + + /* search for VMAs that fall within the dead zone */ + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { + /* found one - only interested if it's shared out of the page + * cache */ + if (vma->vm_flags & VM_SHARED) { + i_mmap_unlock_read(inode->i_mapping); + up_write(&nommu_region_sem); + return -ETXTBSY; /* not quite true, but near enough */ + } + } + + /* reduce any regions that overlap the dead zone - if in existence, + * these will be pointed to by VMAs that don't overlap the dead zone + * + * we don't check for any regions that start beyond the EOF as there + * shouldn't be any + */ + vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, 0, ULONG_MAX) { + if (!(vma->vm_flags & VM_SHARED)) + continue; + + region = vma->vm_region; + r_size = region->vm_top - region->vm_start; + r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size; + + if (r_top > newsize) { + region->vm_top -= r_top - newsize; + if (region->vm_end > region->vm_top) + region->vm_end = region->vm_top; + } + } + + i_mmap_unlock_read(inode->i_mapping); + up_write(&nommu_region_sem); + return 0; +} + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. 
+ */ +static int __meminit init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +subsys_initcall(init_user_reserve); + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int __meminit init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +subsys_initcall(init_admin_reserve); diff --git a/mm/oom_kill.c b/mm/oom_kill.c new file mode 100644 index 000000000..1276e49b3 --- /dev/null +++ b/mm/oom_kill.c @@ -0,0 +1,1262 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/mm/oom_kill.c + * + * Copyright (C) 1998,2000 Rik van Riel + * Thanks go out to Claus Fischer for some serious inspiration and + * for goading me into coding this file... + * Copyright (C) 2010 Google, Inc. + * Rewritten by David Rientjes + * + * The routines in this file are used to kill a process when + * we're seriously out of memory. This gets called from __alloc_pages() + * in mm/page_alloc.c when we really run out of memory. + * + * Since we won't call these routines often (on a well-configured + * machine) this file will double as a 'coding guide' and a signpost + * for newbie kernel hackers. It features several pointers to major + * kernel subsystems and hints as to where to find out what things do. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "internal.h" +#include "slab.h" + +#define CREATE_TRACE_POINTS +#include + +static int sysctl_panic_on_oom; +static int sysctl_oom_kill_allocating_task; +static int sysctl_oom_dump_tasks = 1; + +/* + * Serializes oom killer invocations (out_of_memory()) from all contexts to + * prevent from over eager oom killing (e.g. when the oom killer is invoked + * from different domains). + * + * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled + * and mark_oom_victim + */ +DEFINE_MUTEX(oom_lock); +/* Serializes oom_score_adj and oom_score_adj_min updates */ +DEFINE_MUTEX(oom_adj_mutex); + +static inline bool is_memcg_oom(struct oom_control *oc) +{ + return oc->memcg != NULL; +} + +#ifdef CONFIG_NUMA +/** + * oom_cpuset_eligible() - check task eligibility for kill + * @start: task struct of which task to consider + * @oc: pointer to struct oom_control + * + * Task eligibility is determined by whether or not a candidate task, @tsk, + * shares the same mempolicy nodes as current if it is bound by such a policy + * and whether or not it has the same set of allowed cpuset nodes. + * + * This function is assuming oom-killer context and 'current' has triggered + * the oom-killer. + */ +static bool oom_cpuset_eligible(struct task_struct *start, + struct oom_control *oc) +{ + struct task_struct *tsk; + bool ret = false; + const nodemask_t *mask = oc->nodemask; + + rcu_read_lock(); + for_each_thread(start, tsk) { + if (mask) { + /* + * If this is a mempolicy constrained oom, tsk's + * cpuset is irrelevant. Only return true if its + * mempolicy intersects current, otherwise it may be + * needlessly killed. 
+ */ + ret = mempolicy_in_oom_domain(tsk, mask); + } else { + /* + * This is not a mempolicy constrained oom, so only + * check the mems of tsk's cpuset. + */ + ret = cpuset_mems_allowed_intersects(current, tsk); + } + if (ret) + break; + } + rcu_read_unlock(); + + return ret; +} +#else +static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc) +{ + return true; +} +#endif /* CONFIG_NUMA */ + +/* + * The process p may have detached its own ->mm while exiting or through + * kthread_use_mm(), but one or more of its subthreads may still have a valid + * pointer. Return p, or any of its subthreads with a valid ->mm, with + * task_lock() held. + */ +struct task_struct *find_lock_task_mm(struct task_struct *p) +{ + struct task_struct *t; + + rcu_read_lock(); + + for_each_thread(p, t) { + task_lock(t); + if (likely(t->mm)) + goto found; + task_unlock(t); + } + t = NULL; +found: + rcu_read_unlock(); + + return t; +} + +/* + * order == -1 means the oom kill is required by sysrq, otherwise only + * for display purposes. + */ +static inline bool is_sysrq_oom(struct oom_control *oc) +{ + return oc->order == -1; +} + +/* return true if the task is not adequate as candidate victim task. */ +static bool oom_unkillable_task(struct task_struct *p) +{ + if (is_global_init(p)) + return true; + if (p->flags & PF_KTHREAD) + return true; + return false; +} + +/* + * Check whether unreclaimable slab amount is greater than + * all user memory(LRU pages). + * dump_unreclaimable_slab() could help in the case that + * oom due to too much unreclaimable slab used by kernel. 
+*/ +static bool should_dump_unreclaim_slab(void) +{ + unsigned long nr_lru; + + nr_lru = global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE) + + global_node_page_state(NR_ISOLATED_ANON) + + global_node_page_state(NR_ISOLATED_FILE) + + global_node_page_state(NR_UNEVICTABLE); + + return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru); +} + +/** + * oom_badness - heuristic function to determine which candidate task to kill + * @p: task struct of which task we should calculate + * @totalpages: total present RAM allowed for page allocation + * + * The heuristic for determining which task to kill is made to be as simple and + * predictable as possible. The goal is to return the highest value for the + * task consuming the most memory to avoid subsequent oom failures. + */ +long oom_badness(struct task_struct *p, unsigned long totalpages) +{ + long points; + long adj; + + if (oom_unkillable_task(p)) + return LONG_MIN; + + p = find_lock_task_mm(p); + if (!p) + return LONG_MIN; + + /* + * Do not even consider tasks which are explicitly marked oom + * unkillable or have been already oom reaped or the are in + * the middle of vfork + */ + adj = (long)p->signal->oom_score_adj; + if (adj == OOM_SCORE_ADJ_MIN || + test_bit(MMF_OOM_SKIP, &p->mm->flags) || + in_vfork(p)) { + task_unlock(p); + return LONG_MIN; + } + + /* + * The baseline for the badness score is the proportion of RAM that each + * task's rss, pagetable and swap space use. 
+ */ + points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + + mm_pgtables_bytes(p->mm) / PAGE_SIZE; + task_unlock(p); + + /* Normalize to oom_score_adj units */ + adj *= totalpages / 1000; + points += adj; + + return points; +} + +static const char * const oom_constraint_text[] = { + [CONSTRAINT_NONE] = "CONSTRAINT_NONE", + [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET", + [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY", + [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG", +}; + +/* + * Determine the type of allocation constraint. + */ +static enum oom_constraint constrained_alloc(struct oom_control *oc) +{ + struct zone *zone; + struct zoneref *z; + enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask); + bool cpuset_limited = false; + int nid; + + if (is_memcg_oom(oc)) { + oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1; + return CONSTRAINT_MEMCG; + } + + /* Default to all available memory */ + oc->totalpages = totalram_pages() + total_swap_pages; + + if (!IS_ENABLED(CONFIG_NUMA)) + return CONSTRAINT_NONE; + + if (!oc->zonelist) + return CONSTRAINT_NONE; + /* + * Reach here only when __GFP_NOFAIL is used. So, we should avoid + * to kill current.We have to random task kill in this case. + * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now. + */ + if (oc->gfp_mask & __GFP_THISNODE) + return CONSTRAINT_NONE; + + /* + * This is not a __GFP_THISNODE allocation, so a truncated nodemask in + * the page allocator means a mempolicy is in effect. Cpuset policy + * is enforced in get_page_from_freelist(). 
+ */ + if (oc->nodemask && + !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { + oc->totalpages = total_swap_pages; + for_each_node_mask(nid, *oc->nodemask) + oc->totalpages += node_present_pages(nid); + return CONSTRAINT_MEMORY_POLICY; + } + + /* Check this allocation failure is caused by cpuset's wall function */ + for_each_zone_zonelist_nodemask(zone, z, oc->zonelist, + highest_zoneidx, oc->nodemask) + if (!cpuset_zone_allowed(zone, oc->gfp_mask)) + cpuset_limited = true; + + if (cpuset_limited) { + oc->totalpages = total_swap_pages; + for_each_node_mask(nid, cpuset_current_mems_allowed) + oc->totalpages += node_present_pages(nid); + return CONSTRAINT_CPUSET; + } + return CONSTRAINT_NONE; +} + +static int oom_evaluate_task(struct task_struct *task, void *arg) +{ + struct oom_control *oc = arg; + long points; + + if (oom_unkillable_task(task)) + goto next; + + /* p may not have freeable memory in nodemask */ + if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc)) + goto next; + + /* + * This task already has access to memory reserves and is being killed. + * Don't allow any other task to have access to the reserves unless + * the task has MMF_OOM_SKIP because chances that it would release + * any memory is quite low. + */ + if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { + if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) + goto next; + goto abort; + } + + /* + * If task is allocating a lot of memory and has been marked to be + * killed first if it triggers an oom, then select it. 
+ */ + if (oom_task_origin(task)) { + points = LONG_MAX; + goto select; + } + + points = oom_badness(task, oc->totalpages); + if (points == LONG_MIN || points < oc->chosen_points) + goto next; + +select: + if (oc->chosen) + put_task_struct(oc->chosen); + get_task_struct(task); + oc->chosen = task; + oc->chosen_points = points; +next: + return 0; +abort: + if (oc->chosen) + put_task_struct(oc->chosen); + oc->chosen = (void *)-1UL; + return 1; +} + +/* + * Simple selection loop. We choose the process with the highest number of + * 'points'. In case scan was aborted, oc->chosen is set to -1. + */ +static void select_bad_process(struct oom_control *oc) +{ + oc->chosen_points = LONG_MIN; + + if (is_memcg_oom(oc)) + mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc); + else { + struct task_struct *p; + + rcu_read_lock(); + for_each_process(p) + if (oom_evaluate_task(p, oc)) + break; + rcu_read_unlock(); + } +} + +static int dump_task(struct task_struct *p, void *arg) +{ + struct oom_control *oc = arg; + struct task_struct *task; + + if (oom_unkillable_task(p)) + return 0; + + /* p may not have freeable memory in nodemask */ + if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc)) + return 0; + + task = find_lock_task_mm(p); + if (!task) { + /* + * All of p's threads have already detached their mm's. There's + * no need to report them; they can't be oom killed anyway. + */ + return 0; + } + + pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", + task->pid, from_kuid(&init_user_ns, task_uid(task)), + task->tgid, task->mm->total_vm, get_mm_rss(task->mm), + mm_pgtables_bytes(task->mm), + get_mm_counter(task->mm, MM_SWAPENTS), + task->signal->oom_score_adj, task->comm); + task_unlock(task); + + return 0; +} + +/** + * dump_tasks - dump current memory state of all system tasks + * @oc: pointer to struct oom_control + * + * Dumps the current memory state of all eligible tasks. 
Tasks not in the same + * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes + * are not shown. + * State information includes task's pid, uid, tgid, vm size, rss, + * pgtables_bytes, swapents, oom_score_adj value, and name. + */ +static void dump_tasks(struct oom_control *oc) +{ + pr_info("Tasks state (memory values in pages):\n"); + pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); + + if (is_memcg_oom(oc)) + mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); + else { + struct task_struct *p; + + rcu_read_lock(); + for_each_process(p) + dump_task(p, oc); + rcu_read_unlock(); + } +} + +static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) +{ + /* one line summary of the oom killer context. */ + pr_info("oom-kill:constraint=%s,nodemask=%*pbl", + oom_constraint_text[oc->constraint], + nodemask_pr_args(oc->nodemask)); + cpuset_print_current_mems_allowed(); + mem_cgroup_print_oom_context(oc->memcg, victim); + pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid, + from_kuid(&init_user_ns, task_uid(victim))); +} + +static void dump_header(struct oom_control *oc, struct task_struct *p) +{ + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, + current->signal->oom_score_adj); + if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) + pr_warn("COMPACTION is disabled!!!\n"); + + dump_stack(); + if (is_memcg_oom(oc)) + mem_cgroup_print_oom_meminfo(oc->memcg); + else { + __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask)); + if (should_dump_unreclaim_slab()) + dump_unreclaimable_slab(); + } + if (sysctl_oom_dump_tasks) + dump_tasks(oc); + if (p) + dump_oom_summary(oc, p); +} + +/* + * Number of OOM victims in flight + */ +static atomic_t oom_victims = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); + +static bool oom_killer_disabled __read_mostly; + +#define 
K(x) ((x) << (PAGE_SHIFT-10)) + +/* + * task->mm can be NULL if the task is the exited group leader. So to + * determine whether the task is using a particular mm, we examine all the + * task's threads: if one of those is using this mm then this task was also + * using it. + */ +bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(p, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm) + return t_mm == mm; + } + return false; +} + +#ifdef CONFIG_MMU +/* + * OOM Reaper kernel thread which tries to reap the memory used by the OOM + * victim (if that is possible) to help the OOM killer to move on. + */ +static struct task_struct *oom_reaper_th; +static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); +static struct task_struct *oom_reaper_list; +static DEFINE_SPINLOCK(oom_reaper_lock); + +static bool __oom_reap_task_mm(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + bool ret = true; + VMA_ITERATOR(vmi, mm, 0); + + /* + * Tell all users of get_user/copy_from_user etc... that the content + * is no longer stable. No barriers really needed because unmapping + * should imply barriers already and the reader would hit a page fault + * if it stumbled over a reaped memory. + */ + set_bit(MMF_UNSTABLE, &mm->flags); + + for_each_vma(vmi, vma) { + if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) + continue; + + /* + * Only anonymous pages have a good chance to be dropped + * without additional steps which we cannot afford as we + * are OOM already. + * + * We do not even care about fs backed pages because all + * which are reclaimable have already been reclaimed and + * we do not want to block exit_mmap by keeping mm ref + * count elevated without a good reason. 
+ */ + if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { + struct mmu_notifier_range range; + struct mmu_gather tlb; + + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, + vma, mm, vma->vm_start, + vma->vm_end); + tlb_gather_mmu(&tlb, mm); + if (mmu_notifier_invalidate_range_start_nonblock(&range)) { + tlb_finish_mmu(&tlb); + ret = false; + continue; + } + unmap_page_range(&tlb, vma, range.start, range.end, NULL); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); + } + } + + return ret; +} + +/* + * Reaps the address space of the give task. + * + * Returns true on success and false if none or part of the address space + * has been reclaimed and the caller should retry later. + */ +static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) +{ + bool ret = true; + + if (!mmap_read_trylock(mm)) { + trace_skip_task_reaping(tsk->pid); + return false; + } + + /* + * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't + * work on the mm anymore. The check for MMF_OOM_SKIP must run + * under mmap_lock for reading because it serializes against the + * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap(). + */ + if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + trace_skip_task_reaping(tsk->pid); + goto out_unlock; + } + + trace_start_task_reaping(tsk->pid); + + /* failed to reap part of the address space. 
Try again later */ + ret = __oom_reap_task_mm(mm); + if (!ret) + goto out_finish; + + pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", + task_pid_nr(tsk), tsk->comm, + K(get_mm_counter(mm, MM_ANONPAGES)), + K(get_mm_counter(mm, MM_FILEPAGES)), + K(get_mm_counter(mm, MM_SHMEMPAGES))); +out_finish: + trace_finish_task_reaping(tsk->pid); +out_unlock: + mmap_read_unlock(mm); + + return ret; +} + +#define MAX_OOM_REAP_RETRIES 10 +static void oom_reap_task(struct task_struct *tsk) +{ + int attempts = 0; + struct mm_struct *mm = tsk->signal->oom_mm; + + /* Retry the mmap_read_trylock(mm) a few times */ + while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm)) + schedule_timeout_idle(HZ/10); + + if (attempts <= MAX_OOM_REAP_RETRIES || + test_bit(MMF_OOM_SKIP, &mm->flags)) + goto done; + + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); + sched_show_task(tsk); + debug_show_all_locks(); + +done: + tsk->oom_reaper_list = NULL; + + /* + * Hide this mm from OOM killer because it has been either reaped or + * somebody can't call mmap_write_unlock(mm). 
+ */ + set_bit(MMF_OOM_SKIP, &mm->flags); + + /* Drop a reference taken by queue_oom_reaper */ + put_task_struct(tsk); +} + +static int oom_reaper(void *unused) +{ + set_freezable(); + + while (true) { + struct task_struct *tsk = NULL; + + wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); + spin_lock_irq(&oom_reaper_lock); + if (oom_reaper_list != NULL) { + tsk = oom_reaper_list; + oom_reaper_list = tsk->oom_reaper_list; + } + spin_unlock_irq(&oom_reaper_lock); + + if (tsk) + oom_reap_task(tsk); + } + + return 0; +} + +static void wake_oom_reaper(struct timer_list *timer) +{ + struct task_struct *tsk = container_of(timer, struct task_struct, + oom_reaper_timer); + struct mm_struct *mm = tsk->signal->oom_mm; + unsigned long flags; + + /* The victim managed to terminate on its own - see exit_mmap */ + if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + put_task_struct(tsk); + return; + } + + spin_lock_irqsave(&oom_reaper_lock, flags); + tsk->oom_reaper_list = oom_reaper_list; + oom_reaper_list = tsk; + spin_unlock_irqrestore(&oom_reaper_lock, flags); + trace_wake_reaper(tsk->pid); + wake_up(&oom_reaper_wait); +} + +/* + * Give the OOM victim time to exit naturally before invoking the oom_reaping. + * The timers timeout is arbitrary... the longer it is, the longer the worst + * case scenario for the OOM can take. If it is too small, the oom_reaper can + * get in the way and release resources needed by the process exit path. + * e.g. The futex robust list can sit in Anon|Private memory that gets reaped + * before the exit path is able to wake the futex waiters. + */ +#define OOM_REAPER_DELAY (2*HZ) +static void queue_oom_reaper(struct task_struct *tsk) +{ + /* mm is already queued? 
*/ + if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) + return; + + get_task_struct(tsk); + timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0); + tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY; + add_timer(&tsk->oom_reaper_timer); +} + +#ifdef CONFIG_SYSCTL +static struct ctl_table vm_oom_kill_table[] = { + { + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "oom_kill_allocating_task", + .data = &sysctl_oom_kill_allocating_task, + .maxlen = sizeof(sysctl_oom_kill_allocating_task), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "oom_dump_tasks", + .data = &sysctl_oom_dump_tasks, + .maxlen = sizeof(sysctl_oom_dump_tasks), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +#endif + +static int __init oom_init(void) +{ + oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); +#ifdef CONFIG_SYSCTL + register_sysctl_init("vm", vm_oom_kill_table); +#endif + return 0; +} +subsys_initcall(oom_init) +#else +static inline void queue_oom_reaper(struct task_struct *tsk) +{ +} +#endif /* CONFIG_MMU */ + +/** + * mark_oom_victim - mark the given task as OOM victim + * @tsk: task to mark + * + * Has to be called with oom_lock held and never after + * oom has been disabled already. + * + * tsk->mm has to be non NULL and caller has to guarantee it is stable (either + * under task_lock or operate on the current). + */ +static void mark_oom_victim(struct task_struct *tsk) +{ + struct mm_struct *mm = tsk->mm; + + WARN_ON(oom_killer_disabled); + /* OOM killer might race with memcg OOM */ + if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) + return; + + /* oom_mm is bound to the signal struct life time. 
+ */ + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) + mmgrab(tsk->signal->oom_mm); + + /* + * Make sure that the task is woken up from uninterruptible sleep + * if it is frozen, because otherwise the OOM killer would not be able + * to free any memory and would livelock. freezing_slow_path will tell + * the freezer that TIF_MEMDIE tasks should be ignored. + */ + __thaw_task(tsk); + atomic_inc(&oom_victims); + trace_mark_victim(tsk->pid); +} + +/** + * exit_oom_victim - note the exit of an OOM victim + * + * Clears TIF_MEMDIE via clear_thread_flag(), decrements oom_victims and, + * once that count drops to zero, wakes every waiter on oom_victims_wait. + */ +void exit_oom_victim(void) +{ + clear_thread_flag(TIF_MEMDIE); + + if (!atomic_dec_return(&oom_victims)) + wake_up_all(&oom_victims_wait); +} + +/** + * oom_killer_enable - enable OOM killer + * + * Clears oom_killer_disabled and logs that the OOM killer is enabled. + */ +void oom_killer_enable(void) +{ + oom_killer_disabled = false; + pr_info("OOM killer enabled.\n"); +} + +/** + * oom_killer_disable - disable OOM killer + * @timeout: maximum timeout to wait for oom victims in jiffies + * + * Forces all page allocations to fail rather than trigger OOM killer. + * Will block and wait until all OOM victims are killed or the given + * timeout expires. + * + * The function cannot be called when there are runnable user tasks because + * the userspace would see unexpected allocation failures as a result. Any + * new usage of this function should be discussed with MM people. + * + * Returns true if successful and false if the OOM killer cannot be + * disabled. + */ +bool oom_killer_disable(signed long timeout) +{ + signed long ret; + + /* + * Make sure to not race with an ongoing OOM killer. Check that the + * current is not killed (possibly due to sharing the victim's memory). 
+ */ + if (mutex_lock_killable(&oom_lock)) + return false; + oom_killer_disabled = true; + mutex_unlock(&oom_lock); + + ret = wait_event_interruptible_timeout(oom_victims_wait, + !atomic_read(&oom_victims), timeout); + if (ret <= 0) { + oom_killer_enable(); + return false; + } + pr_info("OOM killer disabled.\n"); + + return true; +} + +static inline bool __task_will_free_mem(struct task_struct *task) +{ + struct signal_struct *sig = task->signal; + + /* + * A coredumping process may sleep for an extended period in + * coredump_task_exit(), so the oom killer cannot assume that + * the process will promptly exit and release memory. + */ + if (sig->core_state) + return false; + + if (sig->flags & SIGNAL_GROUP_EXIT) + return true; + + if (thread_group_empty(task) && (task->flags & PF_EXITING)) + return true; + + return false; +} + +/* + * Checks whether the given task is dying or exiting and likely to + * release its address space. This means that all threads and processes + * sharing the same mm have to be killed or exiting. + * Caller has to make sure that task->mm is stable (hold task_lock or + * it operates on the current). + */ +static bool task_will_free_mem(struct task_struct *task) +{ + struct mm_struct *mm = task->mm; + struct task_struct *p; + bool ret = true; + + /* + * Skip tasks without mm because it might have passed its exit_mm and + * exit_oom_victim. oom_reaper could have rescued that but do not rely + * on that for now. We can consider find_lock_task_mm in future. 
+ */ + if (!mm) + return false; + + if (!__task_will_free_mem(task)) + return false; + + /* + * This task has already been drained by the oom reaper so there are + * only small chances it will free some more + */ + if (test_bit(MMF_OOM_SKIP, &mm->flags)) + return false; + + if (atomic_read(&mm->mm_users) <= 1) + return true; + + /* + * Make sure that all tasks which share the mm with the given tasks + * are dying as well to make sure that a) nobody pins its mm and + * b) the task is also reapable by the oom reaper. + */ + rcu_read_lock(); + for_each_process(p) { + if (!process_shares_mm(p, mm)) + continue; + if (same_thread_group(task, p)) + continue; + ret = __task_will_free_mem(p); + if (!ret) + break; + } + rcu_read_unlock(); + + return ret; +} + +static void __oom_kill_process(struct task_struct *victim, const char *message) +{ + struct task_struct *p; + struct mm_struct *mm; + bool can_oom_reap = true; + + p = find_lock_task_mm(victim); + if (!p) { + pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n", + message, task_pid_nr(victim), victim->comm); + put_task_struct(victim); + return; + } else if (victim != p) { + get_task_struct(p); + put_task_struct(victim); + victim = p; + } + + /* Get a reference to safely compare mm after task_unlock(victim) */ + mm = victim->mm; + mmgrab(mm); + + /* Raise event before sending signal: task reaper must see this */ + count_vm_event(OOM_KILL); + memcg_memory_event_mm(mm, MEMCG_OOM_KILL); + + /* + * We should send SIGKILL before granting access to memory reserves + * in order to prevent the OOM victim from depleting the memory + * reserves from the user space under its control. 
+ */ + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); + mark_oom_victim(victim); + pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n", + message, task_pid_nr(victim), victim->comm, K(mm->total_vm), + K(get_mm_counter(mm, MM_ANONPAGES)), + K(get_mm_counter(mm, MM_FILEPAGES)), + K(get_mm_counter(mm, MM_SHMEMPAGES)), + from_kuid(&init_user_ns, task_uid(victim)), + mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj); + task_unlock(victim); + + /* + * Kill all user processes sharing victim->mm in other thread groups, if + * any. They don't get access to memory reserves, though, to avoid + * depletion of all memory. This prevents mm->mmap_lock livelock when an + * oom killed thread cannot exit because it requires the semaphore and + * it is contended by another thread trying to allocate memory itself. + * That thread will now get access to memory reserves since it has a + * pending fatal signal. + */ + rcu_read_lock(); + for_each_process(p) { + if (!process_shares_mm(p, mm)) + continue; + if (same_thread_group(p, victim)) + continue; + if (is_global_init(p)) { /* mm is shared with global init: flag it MMF_OOM_SKIP and do not queue the reaper */ + can_oom_reap = false; + set_bit(MMF_OOM_SKIP, &mm->flags); + pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", + task_pid_nr(victim), victim->comm, + task_pid_nr(p), p->comm); + continue; + } + /* + * No kthread_use_mm() user needs to read from the userspace so + * we are ok to reap it. + */ + if (unlikely(p->flags & PF_KTHREAD)) + continue; + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID); + } + rcu_read_unlock(); + /* Queue the reaper unless global init was found sharing the mm. */ + if (can_oom_reap) + queue_oom_reaper(victim); + /* Drop the mm reference taken with mmgrab() earlier in this function and the task reference handed in by the caller. */ + mmdrop(mm); + put_task_struct(victim); +} +#undef K + +/* + * Kill provided task unless it's secured by setting + * oom_score_adj to OOM_SCORE_ADJ_MIN. 
+ */ +static int oom_kill_memcg_member(struct task_struct *task, void *message) +{ + if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN && + !is_global_init(task)) { + get_task_struct(task); + __oom_kill_process(task, message); + } + return 0; +} + +static void oom_kill_process(struct oom_control *oc, const char *message) +{ + struct task_struct *victim = oc->chosen; + struct mem_cgroup *oom_group; + static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + /* + * If the task is already exiting, don't alarm the sysadmin or kill + * its children or threads, just give it access to memory reserves + * so it can die quickly + */ + task_lock(victim); + if (task_will_free_mem(victim)) { + mark_oom_victim(victim); + queue_oom_reaper(victim); + task_unlock(victim); + put_task_struct(victim); + return; + } + task_unlock(victim); + + if (__ratelimit(&oom_rs)) + dump_header(oc, victim); + + /* + * Do we need to kill the entire memory cgroup? + * Or even one of the ancestor memory cgroups? + * Check this out before killing the victim task. + */ + oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); + + __oom_kill_process(victim, message); + + /* + * If necessary, kill all tasks in the selected memory cgroup. + */ + if (oom_group) { + memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL); + mem_cgroup_print_oom_group(oom_group); + mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, + (void *)message); + mem_cgroup_put(oom_group); + } +} + +/* + * Determines whether the kernel must panic because of the panic_on_oom sysctl. + */ +static void check_panic_on_oom(struct oom_control *oc) +{ + if (likely(!sysctl_panic_on_oom)) + return; + if (sysctl_panic_on_oom != 2) { + /* + * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel + * does not panic for cpuset, mempolicy, or memcg allocation + * failures. 
+ */ + if (oc->constraint != CONSTRAINT_NONE) + return; + } + /* Do not panic for oom kills triggered by sysrq */ + if (is_sysrq_oom(oc)) + return; + dump_header(oc, NULL); + panic("Out of memory: %s panic_on_oom is enabled\n", + sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); +} + +static BLOCKING_NOTIFIER_HEAD(oom_notify_list); + +int register_oom_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&oom_notify_list, nb); +} +EXPORT_SYMBOL_GPL(register_oom_notifier); + +int unregister_oom_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&oom_notify_list, nb); +} +EXPORT_SYMBOL_GPL(unregister_oom_notifier); + +/** + * out_of_memory - kill the "best" process when we run out of memory + * @oc: pointer to struct oom_control + * + * If we run out of memory, we have the choice between either + * killing a random task (bad), letting the system crash (worse) + * OR try to be smart about which process to kill. Note that we + * don't have to be perfect here, we just have to be good. + */ +bool out_of_memory(struct oom_control *oc) +{ + unsigned long freed = 0; + + if (oom_killer_disabled) + return false; + + if (!is_memcg_oom(oc)) { + blocking_notifier_call_chain(&oom_notify_list, 0, &freed); + if (freed > 0 && !is_sysrq_oom(oc)) + /* Got some memory back in the last second. */ + return true; + } + + /* + * If current has a pending SIGKILL or is exiting, then automatically + * select it. The goal is to allow it to allocate so that it may + * quickly exit and free its memory. + */ + if (task_will_free_mem(current)) { + mark_oom_victim(current); + queue_oom_reaper(current); + return true; + } + + /* + * The OOM killer does not compensate for IO-less reclaim. + * pagefault_out_of_memory lost its gfp context so we have to + * make sure exclude 0 mask - all other users should have at least + * ___GFP_DIRECT_RECLAIM to get here. 
But mem_cgroup_oom() has to + * invoke the OOM killer even if it is a GFP_NOFS allocation. + */ + if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) + return true; + + /* + * Check if there were limitations on the allocation (only relevant for + * NUMA and memcg) that may require different handling. + */ + oc->constraint = constrained_alloc(oc); + if (oc->constraint != CONSTRAINT_MEMORY_POLICY) + oc->nodemask = NULL; + check_panic_on_oom(oc); + + if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task && + current->mm && !oom_unkillable_task(current) && + oom_cpuset_eligible(current, oc) && + current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { + get_task_struct(current); + oc->chosen = current; + oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)"); + return true; + } + + select_bad_process(oc); + /* Found nothing?!?! */ + if (!oc->chosen) { + dump_header(oc, NULL); + pr_warn("Out of memory and no killable processes...\n"); + /* + * If we got here due to an actual allocation at the + * system level, we cannot survive this and will enter + * an endless loop in the allocator. Bail out now. + */ + if (!is_sysrq_oom(oc) && !is_memcg_oom(oc)) + panic("System is deadlocked on memory\n"); + } + if (oc->chosen && oc->chosen != (void *)-1UL) + oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" : + "Memory cgroup out of memory"); + return !!oc->chosen; +} + +/* + * The pagefault handler calls here because some allocation has failed. We have + * to take care of the memcg OOM here because this is the only safe context without + * any locks held but let the oom killer triggered from the allocation context care + * about the global OOM. 
+ */ +void pagefault_out_of_memory(void) +{ + static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if (mem_cgroup_oom_synchronize(true)) + return; + + if (fatal_signal_pending(current)) + return; + + if (__ratelimit(&pfoom_rs)) + pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n"); +} + +SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) +{ +#ifdef CONFIG_MMU + struct mm_struct *mm = NULL; + struct task_struct *task; + struct task_struct *p; + unsigned int f_flags; + bool reap = false; + long ret = 0; + + if (flags) + return -EINVAL; + + task = pidfd_get_task(pidfd, &f_flags); + if (IS_ERR(task)) + return PTR_ERR(task); + + /* + * Make sure to choose a thread which still has a reference to mm + * during the group exit + */ + p = find_lock_task_mm(task); + if (!p) { + ret = -ESRCH; + goto put_task; + } + + mm = p->mm; + mmgrab(mm); + + if (task_will_free_mem(p)) + reap = true; + else { + /* Error only if the work has not been done already */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + ret = -EINVAL; + } + task_unlock(p); + + if (!reap) + goto drop_mm; + + if (mmap_read_lock_killable(mm)) { + ret = -EINTR; + goto drop_mm; + } + /* + * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure + * possible change in exit_mmap is seen + */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) + ret = -EAGAIN; + mmap_read_unlock(mm); + +drop_mm: + mmdrop(mm); +put_task: + put_task_struct(task); + return ret; +#else + return -ENOSYS; +#endif /* CONFIG_MMU */ +} diff --git a/mm/page-writeback.c b/mm/page-writeback.c new file mode 100644 index 000000000..de5f69921 --- /dev/null +++ b/mm/page-writeback.c @@ -0,0 +1,3084 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm/page-writeback.c + * + * Copyright (C) 2002, Linus Torvalds. 
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * Contains functions related to writing back dirty pages at the + * address_space level. + * + * 10Apr2002 Andrew Morton + * Initial version + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * Sleep at most 200ms at a time in balance_dirty_pages(). + */ +#define MAX_PAUSE max(HZ/5, 1) + +/* + * Try to keep balance_dirty_pages() call intervals higher than this many pages + * by raising pause time to max_pause when falls below it. + */ +#define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) + +/* + * Estimate write bandwidth at 200ms intervals. + */ +#define BANDWIDTH_INTERVAL max(HZ/5, 1) + +#define RATELIMIT_CALC_SHIFT 10 + +/* + * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited + * will look to see if it needs to force writeback or throttling. 
+ */ +static long ratelimit_pages = 32; + +/* The following parameters are exported via /proc/sys/vm */ + +/* + * Start background writeback (via writeback threads) at this percentage + */ +static int dirty_background_ratio = 10; + +/* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of + * dirty_background_ratio * the amount of dirtyable memory + */ +static unsigned long dirty_background_bytes; + +/* + * free highmem will not be subtracted from the total free memory + * for calculating free ratios if vm_highmem_is_dirtyable is true + */ +static int vm_highmem_is_dirtyable; + +/* + * The generator of dirty data starts writeback at this percentage + */ +static int vm_dirty_ratio = 20; + +/* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of + * vm_dirty_ratio * the amount of dirtyable memory + */ +static unsigned long vm_dirty_bytes; + +/* + * The interval between `kupdate'-style writebacks + */ +unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ + +EXPORT_SYMBOL_GPL(dirty_writeback_interval); + +/* + * The longest time for which data is allowed to remain dirty + */ +unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ + +/* + * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: + * a full sync is triggered after this time elapses without any disk activity. 
+ */ +int laptop_mode; + +EXPORT_SYMBOL(laptop_mode); + +/* End of sysctl-exported parameters */ + +struct wb_domain global_wb_domain; + +/* consolidated parameters for balance_dirty_pages() and its subroutines */ +struct dirty_throttle_control { +#ifdef CONFIG_CGROUP_WRITEBACK + struct wb_domain *dom; + struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ +#endif + struct bdi_writeback *wb; + struct fprop_local_percpu *wb_completions; + + unsigned long avail; /* dirtyable */ + unsigned long dirty; /* file_dirty + write + nfs */ + unsigned long thresh; /* dirty threshold */ + unsigned long bg_thresh; /* dirty background threshold */ + + unsigned long wb_dirty; /* per-wb counterparts */ + unsigned long wb_thresh; + unsigned long wb_bg_thresh; + + unsigned long pos_ratio; +}; + +/* + * Length of period for aging writeout fractions of bdis. This is an + * arbitrarily chosen number. The longer the period, the slower fractions will + * reflect changes in current writeout rate. + */ +#define VM_COMPLETIONS_PERIOD_LEN (3*HZ) + +#ifdef CONFIG_CGROUP_WRITEBACK + +#define GDTC_INIT(__wb) .wb = (__wb), \ + .dom = &global_wb_domain, \ + .wb_completions = &(__wb)->completions + +#define GDTC_INIT_NO_WB .dom = &global_wb_domain + +#define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \ + .dom = mem_cgroup_wb_domain(__wb), \ + .wb_completions = &(__wb)->memcg_completions, \ + .gdtc = __gdtc + +static bool mdtc_valid(struct dirty_throttle_control *dtc) +{ + return dtc->dom; +} + +static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) +{ + return dtc->dom; +} + +static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) +{ + return mdtc->gdtc; +} + +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return &wb->memcg_completions; +} + +static void wb_min_max_ratio(struct bdi_writeback *wb, + unsigned long *minp, unsigned long *maxp) +{ + unsigned long this_bw = READ_ONCE(wb->avg_write_bandwidth); + 
unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); + unsigned long long min = wb->bdi->min_ratio; + unsigned long long max = wb->bdi->max_ratio; + + /* + * @wb may already be clean by the time control reaches here and + * the total may not include its bw. + */ + if (this_bw < tot_bw) { + if (min) { + min *= this_bw; + min = div64_ul(min, tot_bw); + } + if (max < 100) { + max *= this_bw; + max = div64_ul(max, tot_bw); + } + } + + *minp = min; + *maxp = max; +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +#define GDTC_INIT(__wb) .wb = (__wb), \ + .wb_completions = &(__wb)->completions +#define GDTC_INIT_NO_WB +#define MDTC_INIT(__wb, __gdtc) + +static bool mdtc_valid(struct dirty_throttle_control *dtc) +{ + return false; +} + +static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) +{ + return &global_wb_domain; +} + +static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) +{ + return NULL; +} + +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return NULL; +} + +static void wb_min_max_ratio(struct bdi_writeback *wb, + unsigned long *minp, unsigned long *maxp) +{ + *minp = wb->bdi->min_ratio; + *maxp = wb->bdi->max_ratio; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +/* + * In a memory zone, there is a certain amount of pages we consider + * available for the page cache, which is essentially the number of + * free and reclaimable pages, minus some zone reserves to protect + * lowmem and the ability to uphold the zone's watermarks without + * requiring writeback. + * + * This number of dirtyable pages is the base value of which the + * user-configurable dirty ratio is the effective number of pages that + * are allowed to be actually dirtied. Per individual zone, or + * globally by using the sum of dirtyable pages over all zones. 
+ * + * Because the user is allowed to specify the dirty limit globally as + * absolute number of bytes, calculating the per-zone dirty limit can + * require translating the configured limit into a percentage of + * global dirtyable memory first. + */ + +/** + * node_dirtyable_memory - number of dirtyable pages in a node + * @pgdat: the node + * + * Return: the node's number of pages potentially available for dirty + * page cache. This is the base value for the per-node dirty limits. + */ +static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) +{ + unsigned long nr_pages = 0; + int z; + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + nr_pages += zone_page_state(zone, NR_FREE_PAGES); + } + + /* + * Pages reserved for the kernel should not be considered + * dirtyable, to prevent a situation where reclaim has to + * clean pages in order to balance the zones. + */ + nr_pages -= min(nr_pages, pgdat->totalreserve_pages); + + nr_pages += node_page_state(pgdat, NR_INACTIVE_FILE); + nr_pages += node_page_state(pgdat, NR_ACTIVE_FILE); + + return nr_pages; +} + +static unsigned long highmem_dirtyable_memory(unsigned long total) +{ +#ifdef CONFIG_HIGHMEM + int node; + unsigned long x = 0; + int i; + + for_each_node_state(node, N_HIGH_MEMORY) { + for (i = ZONE_NORMAL + 1; i < MAX_NR_ZONES; i++) { + struct zone *z; + unsigned long nr_pages; + + if (!is_highmem_idx(i)) + continue; + + z = &NODE_DATA(node)->node_zones[i]; + if (!populated_zone(z)) + continue; + + nr_pages = zone_page_state(z, NR_FREE_PAGES); + /* watch for underflows */ + nr_pages -= min(nr_pages, high_wmark_pages(z)); + nr_pages += zone_page_state(z, NR_ZONE_INACTIVE_FILE); + nr_pages += zone_page_state(z, NR_ZONE_ACTIVE_FILE); + x += nr_pages; + } + } + + /* + * Make sure that the number of highmem pages is never larger + * than the number of the total dirtyable memory. 
This can only + * occur in very strange VM situations but we want to make sure + * that this does not occur. + */ + return min(x, total); +#else + return 0; +#endif +} + +/** + * global_dirtyable_memory - number of globally dirtyable pages + * + * Return: the global number of pages potentially available for dirty + * page cache. This is the base value for the global dirty limits. + */ +static unsigned long global_dirtyable_memory(void) +{ + unsigned long x; + + x = global_zone_page_state(NR_FREE_PAGES); + /* + * Pages reserved for the kernel should not be considered + * dirtyable, to prevent a situation where reclaim has to + * clean pages in order to balance the zones. + */ + x -= min(x, totalreserve_pages); + + x += global_node_page_state(NR_INACTIVE_FILE); + x += global_node_page_state(NR_ACTIVE_FILE); + + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); + + return x + 1; /* Ensure that we never return 0 */ +} + +/** + * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain + * @dtc: dirty_throttle_control of interest + * + * Calculate @dtc->thresh and ->bg_thresh considering + * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller + * must ensure that @dtc->avail is set before calling this function. The + * dirty limits will be lifted by 1/4 for real-time tasks. 
+ */ +static void domain_dirty_limits(struct dirty_throttle_control *dtc) +{ + const unsigned long available_memory = dtc->avail; + struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); + unsigned long bytes = vm_dirty_bytes; + unsigned long bg_bytes = dirty_background_bytes; + /* convert ratios to per-PAGE_SIZE for higher precision */ + unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; + unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; + unsigned long thresh; + unsigned long bg_thresh; + struct task_struct *tsk; + + /* gdtc is !NULL iff @dtc is for memcg domain */ + if (gdtc) { + unsigned long global_avail = gdtc->avail; + + /* + * The byte settings can't be applied directly to memcg + * domains. Convert them to ratios by scaling against + * globally available memory. As the ratios are in + * per-PAGE_SIZE, they can be obtained by dividing bytes by + * number of pages. + */ + if (bytes) + ratio = min(DIV_ROUND_UP(bytes, global_avail), + PAGE_SIZE); + if (bg_bytes) + bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), + PAGE_SIZE); + bytes = bg_bytes = 0; + } + + if (bytes) + thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); + else + thresh = (ratio * available_memory) / PAGE_SIZE; + + if (bg_bytes) + bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); + else + bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; + + if (bg_thresh >= thresh) + bg_thresh = thresh / 2; + tsk = current; + if (rt_task(tsk)) { + bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; + thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; + } + dtc->thresh = thresh; + dtc->bg_thresh = bg_thresh; + + /* we should eventually report the domain in the TP */ + if (!gdtc) + trace_global_dirty_state(bg_thresh, thresh); +} + +/** + * global_dirty_limits - background-writeback and dirty-throttling thresholds + * @pbackground: out parameter for bg_thresh + * @pdirty: out parameter for thresh + * + * Calculate bg_thresh and thresh for global_wb_domain. 
See + * domain_dirty_limits() for details. + */ +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; + + gdtc.avail = global_dirtyable_memory(); + domain_dirty_limits(&gdtc); + + *pbackground = gdtc.bg_thresh; + *pdirty = gdtc.thresh; +} + +/** + * node_dirty_limit - maximum number of dirty pages allowed in a node + * @pgdat: the node + * + * Return: the maximum number of dirty pages allowed in a node, based + * on the node's dirtyable memory. + */ +static unsigned long node_dirty_limit(struct pglist_data *pgdat) +{ + unsigned long node_memory = node_dirtyable_memory(pgdat); + struct task_struct *tsk = current; + unsigned long dirty; + + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * + node_memory / global_dirtyable_memory(); + else + dirty = vm_dirty_ratio * node_memory / 100; + + if (rt_task(tsk)) + dirty += dirty / 4; + + return dirty; +} + +/** + * node_dirty_ok - tells whether a node is within its dirty limits + * @pgdat: the node to check + * + * Return: %true when the dirty pages in @pgdat are within the node's + * dirty limit, %false if the limit is exceeded. 
+ */ +bool node_dirty_ok(struct pglist_data *pgdat) +{ + unsigned long limit = node_dirty_limit(pgdat); + unsigned long nr_pages = 0; + + nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); + nr_pages += node_page_state(pgdat, NR_WRITEBACK); + + return nr_pages <= limit; +} +/* + * sysctl handlers for the dirty-limit knobs. Each one defers parsing to the + * generic proc_dointvec_minmax() or proc_doulongvec_minmax() helper and, + * after a successful write, zeroes the counterpart knob so that only one of + * the ratio/bytes pair stays non-zero. + */ +#ifdef CONFIG_SYSCTL +static int dirty_background_ratio_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_bytes = 0; + return ret; +} +/* Counterpart of the handler above: a successful write clears dirty_background_ratio. */ +static int dirty_background_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + dirty_background_ratio = 0; + return ret; +} +/* + * Unlike the background handlers, this one acts only when the write actually + * changed vm_dirty_ratio: it then calls writeback_set_ratelimit() and clears + * vm_dirty_bytes. + */ +static int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int old_ratio = vm_dirty_ratio; + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_ratio != old_ratio) { + writeback_set_ratelimit(); + vm_dirty_bytes = 0; + } + return ret; +} +/* + * Same as dirty_ratio_handler() but for vm_dirty_bytes: when the value + * changed, call writeback_set_ratelimit() and clear vm_dirty_ratio. + */ +static int dirty_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned long old_bytes = vm_dirty_bytes; + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write && vm_dirty_bytes != old_bytes) { + writeback_set_ratelimit(); + vm_dirty_ratio = 0; + } + return ret; +} +#endif +/* + * Advance @cur_time by VM_COMPLETIONS_PERIOD_LEN. A result of 0 is replaced + * by 1 because 0 is reserved as a special value. + */ +static unsigned long wp_next_time(unsigned long cur_time) +{ + cur_time += VM_COMPLETIONS_PERIOD_LEN; + /* 0 has a special meaning... 
*/ + if (!cur_time) + return 1; + return cur_time; +} +/* + * Add @nr writeout completions to @dom on behalf of the per-wb @completions + * counter, passing @max_prop_frac through to __fprop_add_percpu_max(). If + * dom->period_time is 0 (period switching is off), set it to the next period + * boundary and arm dom->period_timer for it. + */ +static void wb_domain_writeout_add(struct wb_domain *dom, + struct fprop_local_percpu *completions, + unsigned int max_prop_frac, long nr) +{ + __fprop_add_percpu_max(&dom->completions, completions, + max_prop_frac, nr); + /* First event after period switching was turned off? */ + if (unlikely(!dom->period_time)) { + /* + * We can race with other callers of this function here but + * it does not cause any harm since the resulting time when + * timer will fire and what is in dom->period_time will be + * roughly the same. + */ + dom->period_time = wp_next_time(jiffies); + mod_timer(&dom->period_timer, dom->period_time); + } +} + +/* + * Increment @wb's writeout completion count and the global writeout + * completion count by @nr; when mem_cgroup_wb_domain() returns a domain for + * @wb, that domain is credited as well. Called from __folio_end_writeback(). + */ +static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr) +{ + struct wb_domain *cgdom; + + wb_stat_mod(wb, WB_WRITTEN, nr); + wb_domain_writeout_add(&global_wb_domain, &wb->completions, + wb->bdi->max_prop_frac, nr); + + cgdom = mem_cgroup_wb_domain(wb); + if (cgdom) + wb_domain_writeout_add(cgdom, wb_memcg_completions(wb), + wb->bdi->max_prop_frac, nr); +} +/* Account a single writeout completion to @wb with local interrupts disabled. */ +void wb_writeout_inc(struct bdi_writeback *wb) +{ + unsigned long flags; + + local_irq_save(flags); + __wb_writeout_add(wb, 1); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(wb_writeout_inc); + +/* + * On idle system, we can be called long after we scheduled because we use + * deferred timers so count with missed periods. 
+ */ +static void writeout_period(struct timer_list *t) +{ + struct wb_domain *dom = from_timer(dom, t, period_timer); + int miss_periods = (jiffies - dom->period_time) / + VM_COMPLETIONS_PERIOD_LEN; + + if (fprop_new_period(&dom->completions, miss_periods + 1)) { + dom->period_time = wp_next_time(dom->period_time + + miss_periods * VM_COMPLETIONS_PERIOD_LEN); + mod_timer(&dom->period_timer, dom->period_time); + } else { + /* + * Aging has zeroed all fractions. Stop wasting CPU on period + * updates. + */ + dom->period_time = 0; + } +} + +int wb_domain_init(struct wb_domain *dom, gfp_t gfp) +{ + memset(dom, 0, sizeof(*dom)); + + spin_lock_init(&dom->lock); + + timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); + + dom->dirty_limit_tstamp = jiffies; + + return fprop_global_init(&dom->completions, gfp); +} + +#ifdef CONFIG_CGROUP_WRITEBACK +void wb_domain_exit(struct wb_domain *dom) +{ + del_timer_sync(&dom->period_timer); + fprop_global_destroy(&dom->completions); +} +#endif + +/* + * bdi_min_ratio keeps the sum of the minimum dirty shares of all + * registered backing devices, which, for obvious reasons, can not + * exceed 100%. 
+ */ +static unsigned int bdi_min_ratio; +/* + * Set @bdi's min_ratio under bdi_lock and keep the global bdi_min_ratio sum + * in step. Returns -EINVAL if @min_ratio is above bdi->max_ratio, or if + * raising it would bring the global sum to 100 or more; returns 0 otherwise. + */ +int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + unsigned int delta; + int ret = 0; + + spin_lock_bh(&bdi_lock); + if (min_ratio > bdi->max_ratio) { + ret = -EINVAL; + } else { + if (min_ratio < bdi->min_ratio) { + delta = bdi->min_ratio - min_ratio; + bdi_min_ratio -= delta; + bdi->min_ratio = min_ratio; + } else { + delta = min_ratio - bdi->min_ratio; + if (bdi_min_ratio + delta < 100) { + bdi_min_ratio += delta; + bdi->min_ratio = min_ratio; + } else { + ret = -EINVAL; + } + } + } + spin_unlock_bh(&bdi_lock); + + return ret; +} +/* + * Set @bdi's max_ratio and the max_prop_frac derived from it, under + * bdi_lock. Returns -EINVAL if @max_ratio is above 100 or below + * bdi->min_ratio; returns 0 otherwise. + */ +int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) +{ + int ret = 0; + + if (max_ratio > 100) + return -EINVAL; + + spin_lock_bh(&bdi_lock); + if (bdi->min_ratio > max_ratio) { + ret = -EINVAL; + } else { + bdi->max_ratio = max_ratio; + bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; + } + spin_unlock_bh(&bdi_lock); + + return ret; +} +EXPORT_SYMBOL(bdi_set_max_ratio); +/* Midpoint between @thresh and @bg_thresh. */ +static unsigned long dirty_freerun_ceiling(unsigned long thresh, + unsigned long bg_thresh) +{ + return (thresh + bg_thresh) / 2; +} +/* The larger of @thresh and dom->dirty_limit. */ +static unsigned long hard_dirty_limit(struct wb_domain *dom, + unsigned long thresh) +{ + return max(thresh, dom->dirty_limit); +} + +/* + * Memory which can be further allocated to a memcg domain is capped by + * system-wide clean memory excluding the amount being used in the domain. 
+ */ +static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, + unsigned long filepages, unsigned long headroom) +{ + struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); /* each "a - min(a, b)" below subtracts without going below zero */ + unsigned long clean = filepages - min(filepages, mdtc->dirty); + unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); + unsigned long other_clean = global_clean - min(global_clean, clean); + + mdtc->avail = filepages + min(headroom, other_clean); +} + +/** + * __wb_calc_thresh - @wb's share of dirty throttling threshold + * @dtc: dirty_throttle_control of interest + * + * Note that balance_dirty_pages() will only seriously take it as a hard limit + * when sleeping max_pause per page is not enough to keep the dirty pages under + * control. For example, when the device is completely stalled due to some error + * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. + * In the other normal situations, it acts more gently by throttling the tasks + * more (rather than completely block them) when the wb dirty pages go high. + * + * It allocates high/low dirty limits to fast/slow devices, in order to prevent + * - starving fast devices + * - piling up dirty pages (that will take long time to sync) on slow devices + * + * The wb's share of dirty limit will be adapting to its throughput and + * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. + * + * Return: @wb's dirty limit in pages. The term "dirty" in the context of + * dirty balancing includes all PG_dirty and PG_writeback pages. + */ +static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) +{ + struct wb_domain *dom = dtc_dom(dtc); + unsigned long thresh = dtc->thresh; + u64 wb_thresh; + unsigned long numerator, denominator; + unsigned long wb_min_ratio, wb_max_ratio; + + /* + * Calculate this BDI's share of the thresh ratio. 
+ */ + fprop_fraction_percpu(&dom->completions, dtc->wb_completions, + &numerator, &denominator); + + wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; /* part of thresh not reserved by the summed min ratios */ + wb_thresh *= numerator; + wb_thresh = div64_ul(wb_thresh, denominator); + + wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); + + wb_thresh += (thresh * wb_min_ratio) / 100; /* add this wb's own minimum share */ + if (wb_thresh > (thresh * wb_max_ratio) / 100) /* cap at this wb's maximum share */ + wb_thresh = thresh * wb_max_ratio / 100; + + return wb_thresh; +} +/* + * Compute @wb's share of @thresh by running __wb_calc_thresh() on a + * temporary dirty_throttle_control set up with GDTC_INIT(@wb) and @thresh. + */ +unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT(wb), + .thresh = thresh }; + return __wb_calc_thresh(&gdtc); +} + +/* + * f(dirty) := 1.0 + ((setpoint - dirty) / (limit - setpoint))^3 + * + * it's a 3rd order polynomial that subjects to + * + * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast + * (2) f(setpoint) = 1.0 => the balance point + * (3) f(limit) = 0 => the hard limit + * (4) df/dx <= 0 => negative feedback control + * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) + * => fast response on large errors; small oscillation near setpoint + * + * The result is in fixed point, scaled by 1 << RATELIMIT_CALC_SHIFT, and + * clamped to the range [0, 2.0]. + */ +static long long pos_ratio_polynom(unsigned long setpoint, + unsigned long dirty, + unsigned long limit) +{ + long long pos_ratio; + long x; + + x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, + (limit - setpoint) | 1); /* "| 1" keeps the divisor non-zero */ + pos_ratio = x; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; /* pos_ratio is now x cubed, still in fixed point */ + pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + + return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); +} + +/* + * Dirty position control. + * + * (o) global/bdi setpoints + * + * We want the dirty pages be balanced around the global/wb setpoints. + * When the number of dirty pages is higher/lower than the setpoint, the + * dirty position control ratio (and hence task dirty ratelimit) will be + * decreased/increased to bring the dirty pages back to the setpoint. 
+ * + * pos_ratio = 1 << RATELIMIT_CALC_SHIFT + * + * if (dirty < setpoint) scale up pos_ratio + * if (dirty > setpoint) scale down pos_ratio + * + * if (wb_dirty < wb_setpoint) scale up pos_ratio + * if (wb_dirty > wb_setpoint) scale down pos_ratio + * + * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT + * + * (o) global control line + * + * ^ pos_ratio + * | + * | |<===== global dirty control scope ======>| + * 2.0 * * * * * * * + * | .* + * | . * + * | . * + * | . * + * | . * + * | . * + * 1.0 ................................* + * | . . * + * | . . * + * | . . * + * | . . * + * | . . * + * 0 +------------.------------------.----------------------*-------------> + * freerun^ setpoint^ limit^ dirty pages + * + * (o) wb control line + * + * ^ pos_ratio + * | + * | * + * | * + * | * + * | * + * | * |<=========== span ============>| + * 1.0 .......................* + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * | . * + * 1/4 ...............................................* * * * * * * * * * * * + * | . . + * | . . + * | . . + * 0 +----------------------.-------------------------------.-------------> + * wb_setpoint^ x_intercept^ + * + * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can + * be smoothly throttled down to normal if it starts high in situations like + * - start writing to a slow SD card and a fast disk at the same time. The SD + * card's wb_dirty may rush to many times higher than wb_setpoint. 
+ * - the wb dirty thresh drops quickly due to change of JBOD workload + */ +static void wb_position_ratio(struct dirty_throttle_control *dtc) +{ + struct bdi_writeback *wb = dtc->wb; + unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth); + unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); + unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); + unsigned long wb_thresh = dtc->wb_thresh; + unsigned long x_intercept; + unsigned long setpoint; /* dirty pages' target balance point */ + unsigned long wb_setpoint; + unsigned long span; + long long pos_ratio; /* for scaling up/down the rate limit */ + long x; + + dtc->pos_ratio = 0; /* stays 0 on the bare early returns below */ + + if (unlikely(dtc->dirty >= limit)) + return; + + /* + * global setpoint + * + * See comment for pos_ratio_polynom(). + */ + setpoint = (freerun + limit) / 2; + pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); + + /* + * The strictlimit feature is a tool preventing mistrusted filesystems + * from growing a large number of dirty pages before throttling. For + * such filesystems balance_dirty_pages always checks wb counters + * against wb limits. Even if global "nr_dirty" is under "freerun". + * This is especially important for fuse which sets bdi->max_ratio to + * 1% by default. Without strictlimit feature, fuse writeback may + * consume arbitrary amount of RAM because it is accounted in + * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". + * + * Here, in wb_position_ratio(), we calculate pos_ratio based on + * two values: wb_dirty and wb_thresh. Let's consider an example: + * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global + * limits are set by default to 10% and 20% (background and throttle). + * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. + * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is + * about ~6K pages (as the average of background and throttle wb + * limits). 
The 3rd order polynomial will provide positive feedback if + * wb_dirty is under wb_setpoint and vice versa. + * + * Note, that we cannot use global counters in these calculations + * because we want to throttle process writing to a strictlimit wb + * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB + * in the example above). + */ + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + long long wb_pos_ratio; + + if (dtc->wb_dirty < 8) { + dtc->pos_ratio = min_t(long long, pos_ratio * 2, + 2 << RATELIMIT_CALC_SHIFT); + return; + } + + if (dtc->wb_dirty >= wb_thresh) + return; + + wb_setpoint = dirty_freerun_ceiling(wb_thresh, + dtc->wb_bg_thresh); + + if (wb_setpoint == 0 || wb_setpoint == wb_thresh) + return; + + wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, + wb_thresh); + + /* + * Typically, for strictlimit case, wb_setpoint << setpoint + * and pos_ratio >> wb_pos_ratio. In the other words global + * state ("dirty") is not limiting factor and we have to + * make decision based on wb counters. But there is an + * important case when global pos_ratio should get precedence: + * global limits are exceeded (e.g. due to activities on other + * wb's) while given strictlimit wb is below limit. + * + * "pos_ratio * wb_pos_ratio" would work for the case above, + * but it would look too non-natural for the case of all + * activity in the system coming from a single strictlimit wb + * with bdi->max_ratio == 100%. + * + * Note that min() below somewhat changes the dynamics of the + * control system. Normally, pos_ratio value can be well over 3 + * (when globally we are at freerun and wb is well below wb + * setpoint). Now the maximum pos_ratio in the same situation + * is 2. We might want to tweak this if we observe the control + * system is too slow to adapt. + */ + dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); + return; + } + + /* + * We have computed basic pos_ratio above based on global situation. 
If + * the wb is over/under its share of dirty pages, we want to scale + * pos_ratio further down/up. That is done by the following mechanism. + */ + + /* + * wb setpoint + * + * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) + * := (x_intercept - wb_dirty) / (x_intercept - wb_setpoint) + * + * The main wb control line is a linear function that subjects to + * + * (1) f(wb_setpoint) = 1.0 + * (2) k = - 1 / (8 * write_bw) (in single wb case) + * or equally: x_intercept = wb_setpoint + 8 * write_bw + * + * For single wb case, the dirty pages are observed to fluctuate + * regularly within range + * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] + * for various filesystems, where (2) can yield in a reasonable 12.5% + * fluctuation range for pos_ratio. + * + * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its + * own size, so move the slope over accordingly and choose a slope that + * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. + */ + if (unlikely(wb_thresh > dtc->thresh)) + wb_thresh = dtc->thresh; + /* + * It's very possible that wb_thresh is close to 0 not because the + * device is slow, but that it has remained inactive for long time. + * Honour such devices a reasonable good (hopefully IO efficient) + * threshold, so that the occasional writes won't be blocked and active + * writes can rampup the threshold quickly. + */ + wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); + /* + * scale global setpoint to wb's: + * wb_setpoint = setpoint * wb_thresh / thresh + */ + x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); + wb_setpoint = setpoint * (u64)x >> 16; + /* + * Use span=(8*write_bw) in single wb case as indicated by + * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. 
+ * + * span = (wb_thresh / thresh) * (8 * write_bw) + + * ((thresh - wb_thresh) / thresh) * wb_thresh + */ + span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; + x_intercept = wb_setpoint + span; + + if (dtc->wb_dirty < x_intercept - span / 4) { + pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), + (x_intercept - wb_setpoint) | 1); + } else + pos_ratio /= 4; + + /* + * wb reserve area, safeguard against dirty pool underrun and disk idle + * It may push the desired control point of global dirty pages higher + * than setpoint. + */ + x_intercept = wb_thresh / 2; + if (dtc->wb_dirty < x_intercept) { + if (dtc->wb_dirty > x_intercept / 8) + pos_ratio = div_u64(pos_ratio * x_intercept, + dtc->wb_dirty); + else + pos_ratio *= 8; + } + + dtc->pos_ratio = pos_ratio; +} +/* + * Update @wb's write_bandwidth and avg_write_bandwidth from the pages + * written over the last @elapsed jiffies. @written is the current value of + * the written-pages counter; its difference to wb->written_stamp is used. + * When the wb has dirty IO, the change in avg_write_bandwidth is also added + * to wb->bdi->tot_write_bandwidth. + */ +static void wb_update_write_bandwidth(struct bdi_writeback *wb, + unsigned long elapsed, + unsigned long written) +{ + const unsigned long period = roundup_pow_of_two(3 * HZ); + unsigned long avg = wb->avg_write_bandwidth; + unsigned long old = wb->write_bandwidth; + u64 bw; + + /* + * bw = written * HZ / elapsed + * + * write_bandwidth = (bw * elapsed + write_bandwidth * (period - elapsed)) + * / period + * + * @written may have decreased due to folio_account_redirty(). + * Avoid underflowing @bw calculation. 
+ */ + bw = written - min(written, wb->written_stamp); + bw *= HZ; + if (unlikely(elapsed > period)) { + bw = div64_ul(bw, elapsed); + avg = bw; + goto out; + } + bw += (u64)wb->write_bandwidth * (period - elapsed); + bw >>= ilog2(period); + + /* + * one more level of smoothing, for filtering out sudden spikes + */ + if (avg > old && old >= (unsigned long)bw) + avg -= (avg - old) >> 3; + + if (avg < old && old <= (unsigned long)bw) + avg += (old - avg) >> 3; + +out: + /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ + avg = max(avg, 1LU); + if (wb_has_dirty_io(wb)) { + long delta = avg - wb->avg_write_bandwidth; + WARN_ON_ONCE(atomic_long_add_return(delta, + &wb->bdi->tot_write_bandwidth) <= 0); + } + wb->write_bandwidth = bw; + WRITE_ONCE(wb->avg_write_bandwidth, avg); +} +/* + * Move dom->dirty_limit towards dtc->thresh: raise it to thresh in one step + * when it is below, and when it is above max(thresh, dtc->dirty) lower it by + * 1/32 of the gap per call. Otherwise leave it unchanged. + */ +static void update_dirty_limit(struct dirty_throttle_control *dtc) +{ + struct wb_domain *dom = dtc_dom(dtc); + unsigned long thresh = dtc->thresh; + unsigned long limit = dom->dirty_limit; + + /* + * Follow up in one step. + */ + if (limit < thresh) { + limit = thresh; + goto update; + } + + /* + * Follow down slowly. Use the higher one as the target, because thresh + * may drop below dirty. This is exactly the reason to introduce + * dom->dirty_limit which is guaranteed to lie above the dirty pages. 
+ */ + thresh = max(thresh, dtc->dirty); + if (limit > thresh) { + limit -= (limit - thresh) >> 5; + goto update; + } + return; +update: + dom->dirty_limit = limit; +} +/* + * Run update_dirty_limit() for @dtc's domain once BANDWIDTH_INTERVAL has + * passed since dom->dirty_limit_tstamp. The timestamp is checked without + * the lock first and checked again under dom->lock before updating. + */ +static void domain_update_dirty_limit(struct dirty_throttle_control *dtc, + unsigned long now) +{ + struct wb_domain *dom = dtc_dom(dtc); + + /* + * check locklessly first to optimize away locking for the most time + */ + if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) + return; + + spin_lock(&dom->lock); + if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { + update_dirty_limit(dtc); + dom->dirty_limit_tstamp = now; + } + spin_unlock(&dom->lock); +} + +/* + * Maintain wb->dirty_ratelimit, the base dirty throttle rate. + * + * Normal wb tasks will be curbed at or below it in long term. + * Obviously it should be around (write_bw / N) when there are N dd tasks. + */ +static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, + unsigned long dirtied, + unsigned long elapsed) +{ + struct bdi_writeback *wb = dtc->wb; + unsigned long dirty = dtc->dirty; + unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); + unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); + unsigned long setpoint = (freerun + limit) / 2; + unsigned long write_bw = wb->avg_write_bandwidth; + unsigned long dirty_ratelimit = wb->dirty_ratelimit; + unsigned long dirty_rate; + unsigned long task_ratelimit; + unsigned long balanced_dirty_ratelimit; + unsigned long step; + unsigned long x; + unsigned long shift; + + /* + * The dirty rate will match the writeout rate in long term, except + * when dirty pages are truncated by userspace or re-dirtied by FS. + */ + dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed; + + /* + * task_ratelimit reflects each dd's dirty rate for the past 200ms. 
+ */ + task_ratelimit = (u64)dirty_ratelimit * + dtc->pos_ratio >> RATELIMIT_CALC_SHIFT; + task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ + + /* + * A linear estimation of the "balanced" throttle rate. The theory is, + * if there are N dd tasks, each throttled at task_ratelimit, the wb's + * dirty_rate will be measured to be (N * task_ratelimit). So the below + * formula will yield the balanced rate limit (write_bw / N). + * + * Note that the expanded form is not a pure rate feedback: + * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1) + * but also takes pos_ratio into account: + * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2) + * + * (1) is not realistic because pos_ratio also takes part in balancing + * the dirty rate. Consider the state + * pos_ratio = 0.5 (3) + * rate = 2 * (write_bw / N) (4) + * If (1) is used, it will get stuck in that state! Because each dd will + * be throttled at + * task_ratelimit = pos_ratio * rate = (write_bw / N) (5) + * yielding + * dirty_rate = N * task_ratelimit = write_bw (6) + * put (6) into (1) we get + * rate_(i+1) = rate_(i) (7) + * + * So we end up using (2) to always keep + * rate_(i+1) ~= (write_bw / N) (8) + * regardless of the value of pos_ratio. As long as (8) is satisfied, + * pos_ratio is able to drive itself to 1.0, which is not only where + * the dirty count meets the setpoint, but also where the slope of + * pos_ratio is most flat and hence task_ratelimit is least fluctuated. 
+ */ + balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw, + dirty_rate | 1); + /* + * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw + */ + if (unlikely(balanced_dirty_ratelimit > write_bw)) + balanced_dirty_ratelimit = write_bw; + + /* + * We could safely do this and return immediately: + * + * wb->dirty_ratelimit = balanced_dirty_ratelimit; + * + * However to get a more stable dirty_ratelimit, the below elaborated + * code makes use of task_ratelimit to filter out singular points and + * limit the step size. + * + * The below code essentially only uses the relative value of + * + * task_ratelimit - dirty_ratelimit + * = (pos_ratio - 1) * dirty_ratelimit + * + * which reflects the direction and size of dirty position error. + */ + + /* + * dirty_ratelimit will follow balanced_dirty_ratelimit iff + * task_ratelimit is on the same side of dirty_ratelimit, too. + * For example, when + * - dirty_ratelimit > balanced_dirty_ratelimit + * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint) + * lowering dirty_ratelimit will help meet both the position and rate + * control targets. Otherwise, don't update dirty_ratelimit if it will + * only help meet the rate target. After all, what the users ultimately + * feel and care are stable dirty rate and small position error. + * + * |task_ratelimit - dirty_ratelimit| is used to limit the step size + * and filter out the singular points of balanced_dirty_ratelimit. Which + * keeps jumping around randomly and can even leap far away at times + * due to the small 200ms estimation period of dirty_rate (we want to + * keep that period small to reduce time lags). + */ + step = 0; + + /* + * For strictlimit case, calculations above were based on wb counters + * and limits (starting from pos_ratio = wb_position_ratio() and up to + * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). 
+ * Hence, to calculate "step" properly, we have to use wb_dirty as + * "dirty" and wb_setpoint as "setpoint". + * + * We rampup dirty_ratelimit forcibly if wb_dirty is low because + * it's possible that wb_thresh is close to zero due to inactivity + * of backing device. + */ + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + dirty = dtc->wb_dirty; + if (dtc->wb_dirty < 8) + setpoint = dtc->wb_dirty + 1; + else + setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; + } + + if (dirty < setpoint) { + x = min3(wb->balanced_dirty_ratelimit, + balanced_dirty_ratelimit, task_ratelimit); + if (dirty_ratelimit < x) + step = x - dirty_ratelimit; + } else { + x = max3(wb->balanced_dirty_ratelimit, + balanced_dirty_ratelimit, task_ratelimit); + if (dirty_ratelimit > x) + step = dirty_ratelimit - x; + } + + /* + * Don't pursue 100% rate matching. It's impossible since the balanced + * rate itself is constantly fluctuating. So decrease the track speed + * when it gets close to the target. Helps eliminate pointless tremors. + */ + shift = dirty_ratelimit / (2 * step + 1); + if (shift < BITS_PER_LONG) + step = DIV_ROUND_UP(step >> shift, 8); + else + step = 0; + + if (dirty_ratelimit < balanced_dirty_ratelimit) + dirty_ratelimit += step; + else + dirty_ratelimit -= step; + + WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL)); + wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; + + trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); +} +/* + * Under wb->list_lock: when @update_ratelimit is set, refresh the dirty + * limit and dirty ratelimit for @gdtc (and for @mdtc if it is non-NULL), + * then update the write bandwidth estimate and record the dirtied, written + * and time stamps on the wb. + */ +static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, + struct dirty_throttle_control *mdtc, + bool update_ratelimit) +{ + struct bdi_writeback *wb = gdtc->wb; + unsigned long now = jiffies; + unsigned long elapsed; + unsigned long dirtied; + unsigned long written; + + spin_lock(&wb->list_lock); + + /* + * Lockless checks for elapsed time are racy and delayed update after + * IO completion doesn't do it at all (to make sure written pages are + * accounted reasonably quickly). 
Make sure elapsed >= 1 to avoid + * division errors. + */ + elapsed = max(now - wb->bw_time_stamp, 1UL); + dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); + written = percpu_counter_read(&wb->stat[WB_WRITTEN]); + + if (update_ratelimit) { + domain_update_dirty_limit(gdtc, now); + wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); + + /* + * @mdtc is always NULL if !CGROUP_WRITEBACK but the + * compiler has no way to figure that out. Help it. + */ + if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { + domain_update_dirty_limit(mdtc, now); + wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); + } + } + wb_update_write_bandwidth(wb, elapsed, written); + + wb->dirtied_stamp = dirtied; + wb->written_stamp = written; + WRITE_ONCE(wb->bw_time_stamp, now); + spin_unlock(&wb->list_lock); +} +/* + * Update only the write bandwidth estimate for @wb: calls + * __wb_update_bandwidth() with no memcg dtc and update_ratelimit false. + */ +void wb_update_bandwidth(struct bdi_writeback *wb) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + + __wb_update_bandwidth(&gdtc, NULL, false); +} + +/* Interval after which we consider wb idle and don't estimate bandwidth */ +#define WB_BANDWIDTH_IDLE_JIF (HZ) +/* + * If more than WB_BANDWIDTH_IDLE_JIF jiffies have passed since + * wb->bw_time_stamp and wb->writeback_inodes reads as zero, reset the + * dirtied/written stamps to the current counters and the time stamp to now, + * under wb->list_lock. + */ +static void wb_bandwidth_estimate_start(struct bdi_writeback *wb) +{ + unsigned long now = jiffies; + unsigned long elapsed = now - READ_ONCE(wb->bw_time_stamp); + + if (elapsed > WB_BANDWIDTH_IDLE_JIF && + !atomic_read(&wb->writeback_inodes)) { + spin_lock(&wb->list_lock); + wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED); + wb->written_stamp = wb_stat(wb, WB_WRITTEN); + WRITE_ONCE(wb->bw_time_stamp, now); + spin_unlock(&wb->list_lock); + } +} + +/* + * After a task dirtied this many pages, balance_dirty_pages_ratelimited() + * will look to see if it needs to start dirty throttling. + * + * If dirty_poll_interval is too low, big NUMA machines will call the expensive + * global_zone_page_state() too often. So scale it near-sqrt to the safety margin + * (the number of pages we may dirty without exceeding the dirty limits). 
+ */ +static unsigned long dirty_poll_interval(unsigned long dirty, + unsigned long thresh) +{ + if (thresh > dirty) + return 1UL << (ilog2(thresh - dirty) >> 1); + + return 1; +} + +/* + * Upper bound for a single throttling pause on @wb: derived from @wb_dirty + * and the wb's average write bandwidth, at least 1 and capped at MAX_PAUSE. + */ +static unsigned long wb_max_pause(struct bdi_writeback *wb, + unsigned long wb_dirty) +{ + unsigned long bw = READ_ONCE(wb->avg_write_bandwidth); + unsigned long t; + + /* + * Limit pause time for small memory systems. If sleeping for too long + * time, a small pool of dirty/writeback pages may go empty and disk go + * idle. + * + * 8 serves as the safety ratio. + */ + t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); + t++; + + return min_t(unsigned long, t, MAX_PAUSE); +} + +/* + * Compute the minimum pause for a dirtier on @wb and store the target number + * of pages to dirty between pauses in *@nr_dirtied_pause. + */ +static long wb_min_pause(struct bdi_writeback *wb, + long max_pause, + unsigned long task_ratelimit, + unsigned long dirty_ratelimit, + int *nr_dirtied_pause) +{ + long hi = ilog2(READ_ONCE(wb->avg_write_bandwidth)); + long lo = ilog2(READ_ONCE(wb->dirty_ratelimit)); + long t; /* target pause */ + long pause; /* estimated next pause */ + int pages; /* target nr_dirtied_pause */ + + /* target for 10ms pause on 1-dd case */ + t = max(1, HZ / 100); + + /* + * Scale up pause time for concurrent dirtiers in order to reduce CPU + * overheads. + * + * (N * 10ms) on 2^N concurrent tasks. + */ + if (hi > lo) + t += (hi - lo) * (10 * HZ) / 1024; + + /* + * This is a bit convoluted. We try to base the next nr_dirtied_pause + * on the much more stable dirty_ratelimit. However the next pause time + * will be computed based on task_ratelimit and the two rate limits may + * depart considerably at some time. Especially if task_ratelimit goes + * below dirty_ratelimit/2 and the target pause is max_pause, the next + * pause time will be max_pause*2 _trimmed down_ to max_pause. As a + * result task_ratelimit won't be executed faithfully, which could + * eventually bring down dirty_ratelimit. 
+ * + * We apply two rules to fix it up: + * 1) try to estimate the next pause time and if necessary, use a lower + * nr_dirtied_pause so as not to exceed max_pause. When this happens, + * nr_dirtied_pause will be "dancing" with task_ratelimit. + * 2) limit the target pause time to max_pause/2, so that the normal + * small fluctuations of task_ratelimit won't trigger rule (1) and + * nr_dirtied_pause will remain as stable as dirty_ratelimit. + */ + t = min(t, 1 + max_pause / 2); + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); + + /* + * Tiny nr_dirtied_pause is found to hurt I/O performance in the test + * case fio-mmap-randwrite-64k, which does 16*{sync read, async write}. + * When the 16 consecutive reads are often interrupted by some dirty + * throttling pause during the async writes, cfq will go into idles + * (deadline is fine). So push nr_dirtied_pause as high as possible + * until reaches DIRTY_POLL_THRESH=32 pages. + */ + if (pages < DIRTY_POLL_THRESH) { + t = max_pause; + pages = dirty_ratelimit * t / roundup_pow_of_two(HZ); + if (pages > DIRTY_POLL_THRESH) { + pages = DIRTY_POLL_THRESH; + t = HZ * DIRTY_POLL_THRESH / dirty_ratelimit; + } + } + + pause = HZ * pages / (task_ratelimit + 1); + if (pause > max_pause) { + t = max_pause; + pages = task_ratelimit * t / roundup_pow_of_two(HZ); + } + + *nr_dirtied_pause = pages; + /* + * The minimal pause time will normally be half the target pause time. + */ + return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; +} + +/* + * Fill in dtc->wb_thresh, dtc->wb_bg_thresh and dtc->wb_dirty for dtc->wb. + */ +static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) +{ + struct bdi_writeback *wb = dtc->wb; + unsigned long wb_reclaimable; + + /* + * wb_thresh is not treated as some limiting factor as + * dirty_thresh, due to reasons + * - in JBOD setup, wb_thresh can fluctuate a lot + * - in a system with HDD and USB key, the USB key may somehow + * go into state (wb_dirty >> wb_thresh) either because + * wb_dirty starts high, or because wb_thresh drops low. 
+ * In this case we don't want to hard throttle the USB key + * dirtiers for 100 seconds until wb_dirty drops under + * wb_thresh. Instead the auxiliary wb control line in + * wb_position_ratio() will let the dirtier task progress + * at some rate <= (write_bw / 2) for bringing down wb_dirty. + */ + dtc->wb_thresh = __wb_calc_thresh(dtc); + /* + * dtc->thresh is an unsigned long. Divide with div64_ul(), which takes + * an unsigned long divisor, so that the divisor is the very value that + * was just tested for zero; div_u64() takes a u32 divisor and would + * narrow it. The (u64) cast keeps the product from wrapping where + * unsigned long is 32 bits wide. + */ + dtc->wb_bg_thresh = dtc->thresh ? + div64_ul((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; + + /* + * In order to avoid the stacked BDI deadlock we need + * to ensure we accurately count the 'dirty' pages when + * the threshold is low. + * + * Otherwise it would be possible to get thresh+n pages + * reported dirty, even though there are thresh-m pages + * actually dirty; with m+n sitting in the percpu + * deltas. + */ + if (dtc->wb_thresh < 2 * wb_stat_error()) { + wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); + dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); + } else { + wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); + dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); + } +} + +/* + * balance_dirty_pages() must be called by processes which are generating dirty + * data. It looks at the number of dirty pages in the machine and will force + * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. + * If we're over `background_thresh' then the writeback threads are woken to + * perform some writeout. + */ +static int balance_dirty_pages(struct bdi_writeback *wb, + unsigned long pages_dirtied, unsigned int flags) +{ + struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; + struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; + struct dirty_throttle_control * const gdtc = &gdtc_stor; + struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? 
+ &mdtc_stor : NULL; + struct dirty_throttle_control *sdtc; + unsigned long nr_reclaimable; /* = file_dirty */ + long period; + long pause; + long max_pause; + long min_pause; + int nr_dirtied_pause; + bool dirty_exceeded = false; + unsigned long task_ratelimit; + unsigned long dirty_ratelimit; + struct backing_dev_info *bdi = wb->bdi; + bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; + unsigned long start_time = jiffies; + int ret = 0; + + for (;;) { + unsigned long now = jiffies; + unsigned long dirty, thresh, bg_thresh; + unsigned long m_dirty = 0; /* stop bogus uninit warnings */ + unsigned long m_thresh = 0; + unsigned long m_bg_thresh = 0; + + nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); + gdtc->avail = global_dirtyable_memory(); + gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); + + domain_dirty_limits(gdtc); + + if (unlikely(strictlimit)) { + wb_dirty_limits(gdtc); + + dirty = gdtc->wb_dirty; + thresh = gdtc->wb_thresh; + bg_thresh = gdtc->wb_bg_thresh; + } else { + dirty = gdtc->dirty; + thresh = gdtc->thresh; + bg_thresh = gdtc->bg_thresh; + } + + if (mdtc) { + unsigned long filepages, headroom, writeback; + + /* + * If @wb belongs to !root memcg, repeat the same + * basic calculations for the memcg domain. + */ + mem_cgroup_wb_stats(wb, &filepages, &headroom, + &mdtc->dirty, &writeback); + mdtc->dirty += writeback; + mdtc_calc_avail(mdtc, filepages, headroom); + + domain_dirty_limits(mdtc); + + if (unlikely(strictlimit)) { + wb_dirty_limits(mdtc); + m_dirty = mdtc->wb_dirty; + m_thresh = mdtc->wb_thresh; + m_bg_thresh = mdtc->wb_bg_thresh; + } else { + m_dirty = mdtc->dirty; + m_thresh = mdtc->thresh; + m_bg_thresh = mdtc->bg_thresh; + } + } + + /* + * In laptop mode, we wait until hitting the higher threshold + * before starting background writeout, and then write out all + * the way down to the lower threshold. So slow writers cause + * minimal disk activity. 
+ * + * In normal mode, we start background writeout at the lower + * background_thresh, to keep the amount of dirty memory low. + */ + if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh && + !writeback_in_progress(wb)) + wb_start_background_writeback(wb); + + /* + * Throttle it only when the background writeback cannot + * catch-up. This avoids (excessively) small writeouts + * when the wb limits are ramping up in case of !strictlimit. + * + * In strictlimit case make decision based on the wb counters + * and limits. Small writeouts when the wb limits are ramping + * up are the price we consciously pay for strictlimit-ing. + * + * If memcg domain is in effect, @dirty should be under + * both global and memcg freerun ceilings. + */ + if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && + (!mdtc || + m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { + unsigned long intv; + unsigned long m_intv; + +free_running: + intv = dirty_poll_interval(dirty, thresh); + m_intv = ULONG_MAX; + + current->dirty_paused_when = now; + current->nr_dirtied = 0; + if (mdtc) + m_intv = dirty_poll_interval(m_dirty, m_thresh); + current->nr_dirtied_pause = min(intv, m_intv); + break; + } + + /* Start writeback even when in laptop mode */ + if (unlikely(!writeback_in_progress(wb))) + wb_start_background_writeback(wb); + + mem_cgroup_flush_foreign(wb); + + /* + * Calculate global domain's pos_ratio and select the + * global dtc by default. + */ + if (!strictlimit) { + wb_dirty_limits(gdtc); + + if ((current->flags & PF_LOCAL_THROTTLE) && + gdtc->wb_dirty < + dirty_freerun_ceiling(gdtc->wb_thresh, + gdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be throttled + * when below the per-wb freerun ceiling. + */ + goto free_running; + } + + dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && + ((gdtc->dirty > gdtc->thresh) || strictlimit); + + wb_position_ratio(gdtc); + sdtc = gdtc; + + if (mdtc) { + /* + * If memcg domain is in effect, calculate its + * pos_ratio. 
@wb should satisfy constraints from + * both global and memcg domains. Choose the one + * w/ lower pos_ratio. + */ + if (!strictlimit) { + wb_dirty_limits(mdtc); + + if ((current->flags & PF_LOCAL_THROTTLE) && + mdtc->wb_dirty < + dirty_freerun_ceiling(mdtc->wb_thresh, + mdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be + * throttled when below the per-wb + * freerun ceiling. + */ + goto free_running; + } + dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && + ((mdtc->dirty > mdtc->thresh) || strictlimit); + + wb_position_ratio(mdtc); + if (mdtc->pos_ratio < gdtc->pos_ratio) + sdtc = mdtc; + } + + if (dirty_exceeded != wb->dirty_exceeded) + wb->dirty_exceeded = dirty_exceeded; + + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + + BANDWIDTH_INTERVAL)) + __wb_update_bandwidth(gdtc, mdtc, true); + + /* throttle according to the chosen dtc */ + dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit); + task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> + RATELIMIT_CALC_SHIFT; + max_pause = wb_max_pause(wb, sdtc->wb_dirty); + min_pause = wb_min_pause(wb, max_pause, + task_ratelimit, dirty_ratelimit, + &nr_dirtied_pause); + + if (unlikely(task_ratelimit == 0)) { + period = max_pause; + pause = max_pause; + goto pause; + } + period = HZ * pages_dirtied / task_ratelimit; + pause = period; + if (current->dirty_paused_when) + pause -= now - current->dirty_paused_when; + /* + * For less than 1s think time (ext3/4 may block the dirtier + * for up to 800ms from time to time on 1-HDD; so does xfs, + * however at much less frequency), try to compensate it in + * future periods by updating the virtual time; otherwise just + * do a reset, as it may be a light dirtier. 
+ */ + if (pause < min_pause) { + trace_balance_dirty_pages(wb, + sdtc->thresh, + sdtc->bg_thresh, + sdtc->dirty, + sdtc->wb_thresh, + sdtc->wb_dirty, + dirty_ratelimit, + task_ratelimit, + pages_dirtied, + period, + min(pause, 0L), + start_time); + if (pause < -HZ) { + current->dirty_paused_when = now; + current->nr_dirtied = 0; + } else if (period) { + current->dirty_paused_when += period; + current->nr_dirtied = 0; + } else if (current->nr_dirtied_pause <= pages_dirtied) + current->nr_dirtied_pause += pages_dirtied; + break; + } + if (unlikely(pause > max_pause)) { + /* for occasional dropped task_ratelimit */ + now += min(pause - max_pause, max_pause); + pause = max_pause; + } + +pause: + trace_balance_dirty_pages(wb, + sdtc->thresh, + sdtc->bg_thresh, + sdtc->dirty, + sdtc->wb_thresh, + sdtc->wb_dirty, + dirty_ratelimit, + task_ratelimit, + pages_dirtied, + period, + pause, + start_time); + if (flags & BDP_ASYNC) { + ret = -EAGAIN; + break; + } + __set_current_state(TASK_KILLABLE); + wb->dirty_sleep = now; + io_schedule_timeout(pause); + + current->dirty_paused_when = now + pause; + current->nr_dirtied = 0; + current->nr_dirtied_pause = nr_dirtied_pause; + + /* + * This is typically equal to (dirty < thresh) and can also + * keep "1000+ dd on a slow USB stick" under control. + */ + if (task_ratelimit) + break; + + /* + * In the case of an unresponsive NFS server and the NFS dirty + * pages exceeds dirty_thresh, give the other good wb's a pipe + * to go through, so that tasks on them still remain responsive. + * + * In theory 1 page is enough to keep the consumer-producer + * pipe going: the flusher cleans 1 page => the task dirties 1 + * more page. However wb_dirty has accounting errors. So use + * the larger and more IO friendly wb_stat_error. 
+ */ + if (sdtc->wb_dirty <= wb_stat_error()) + break; + + if (fatal_signal_pending(current)) + break; + } + return ret; +} + +static DEFINE_PER_CPU(int, bdp_ratelimits); + +/* + * Normal tasks are throttled by + * loop { + * dirty tsk->nr_dirtied_pause pages; + * take a snap in balance_dirty_pages(); + * } + * However there is a worst case. If every task exit immediately when dirtied + * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be + * called to throttle the page dirties. The solution is to save the not yet + * throttled page dirties in dirty_throttle_leaks on task exit and charge them + * randomly into the running tasks. This works well for the above worst case, + * as the new task will pick up and accumulate the old task's leaked dirty + * count and eventually get throttled. + */ +DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; + +/** + * balance_dirty_pages_ratelimited_flags - Balance dirty memory state. + * @mapping: address_space which was dirtied. + * @flags: BDP flags. + * + * Processes which are dirtying memory should call in here once for each page + * which was newly dirtied. The function will periodically check the system's + * dirty state and will initiate writeback if needed. + * + * See balance_dirty_pages_ratelimited() for details. + * + * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to + * indicate that memory is out of balance and the caller must wait + * for I/O to complete. Otherwise, it will return 0 to indicate + * that either memory was already in balance, or it was able to sleep + * until the amount of dirty memory returned to balance. 
+ */
+int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
+					unsigned int flags)
+{
+	struct inode *inode = mapping->host;
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+	struct bdi_writeback *wb = NULL;
+	int ratelimit;
+	int ret = 0;
+	int *p;
+
+	/* Nothing to balance if the backing device does not do writeback. */
+	if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
+		return ret;
+
+	/*
+	 * With cgroup writeback enabled on the inode, try the wb returned by
+	 * wb_get_create_current(); if that path is not taken or yields NULL,
+	 * fall back to the wb embedded in the bdi.
+	 */
+	if (inode_cgwb_enabled(inode))
+		wb = wb_get_create_current(bdi, GFP_KERNEL);
+	if (!wb)
+		wb = &bdi->wb;
+
+	/*
+	 * Start from the task's own pause threshold.  When wb->dirty_exceeded
+	 * is set, clamp it to at most 32KB worth of pages, which is what
+	 * 32 >> (PAGE_SHIFT - 10) evaluates to.
+	 */
+	ratelimit = current->nr_dirtied_pause;
+	if (wb->dirty_exceeded)
+		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
+
+	/* The per-CPU counters below are accessed with preemption disabled. */
+	preempt_disable();
+	/*
+	 * This prevents one CPU from accumulating too many dirtied pages
+	 * without calling into balance_dirty_pages(), which can happen when
+	 * there are 1000+ tasks, all of them start dirtying pages at exactly
+	 * the same time, hence all honoured too large initial
+	 * task->nr_dirtied_pause.
+	 */
+	p = this_cpu_ptr(&bdp_ratelimits);
+	if (unlikely(current->nr_dirtied >= ratelimit))
+		*p = 0;
+	else if (unlikely(*p >= ratelimit_pages)) {
+		*p = 0;
+		/*
+		 * A zero ratelimit makes the nr_dirtied >= ratelimit check
+		 * after preempt_enable() succeed for any non-negative count.
+		 */
+		ratelimit = 0;
+	}
+	/*
+	 * Pick up the dirtied pages by the exited tasks. This avoids lots of
+	 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
+	 * the dirty throttling and livelock other long-run dirtiers.
+	 */
+	p = this_cpu_ptr(&dirty_throttle_leaks);
+	if (*p > 0 && current->nr_dirtied < ratelimit) {
+		unsigned long nr_pages_dirtied;
+		/* Take only as much as brings this task up to its ratelimit. */
+		nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
+		*p -= nr_pages_dirtied;
+		current->nr_dirtied += nr_pages_dirtied;
+	}
+	preempt_enable();
+
+	if (unlikely(current->nr_dirtied >= ratelimit))
+		ret = balance_dirty_pages(wb, current->nr_dirtied, flags);
+
+	/*
+	 * NOTE(review): wb_put() is called for the embedded &bdi->wb as well;
+	 * presumably it is a no-op for that wb - confirm against its
+	 * definition.
+	 */
+	wb_put(wb);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags);
+
+/**
+ * balance_dirty_pages_ratelimited - balance dirty memory state.
+ * @mapping: address_space which was dirtied.
+ *
+ * Processes which are dirtying memory should call in here once for each page
+ * which was newly dirtied.
The function will periodically check the system's
+ * dirty state and will initiate writeback if needed.
+ *
+ * Once we're over the dirty memory limit we decrease the ratelimiting
+ * by a lot, to prevent individual processes from overshooting the limit
+ * by (ratelimit_pages) each.
+ */
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
+{
+	/* Same as the _flags variant with no BDP flags; its result is dropped. */
+	balance_dirty_pages_ratelimited_flags(mapping, 0);
+}
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+
+/**
+ * wb_over_bg_thresh - does @wb need to be written back?
+ * @wb: bdi_writeback of interest
+ *
+ * Determines whether background writeback should keep writing @wb or it's
+ * clean enough.
+ *
+ * Return: %true if writeback should continue.
+ */
+bool wb_over_bg_thresh(struct bdi_writeback *wb)
+{
+	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+	struct dirty_throttle_control * const gdtc = &gdtc_stor;
+	/* mdtc is NULL when mdtc_valid() rejects the memcg control block. */
+	struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+						     &mdtc_stor : NULL;
+	unsigned long reclaimable;
+	unsigned long thresh;
+
+	/*
+	 * Similar to balance_dirty_pages() but ignores pages being written
+	 * as we're trying to decide whether to put more under writeback.
+	 */
+	gdtc->avail = global_dirtyable_memory();
+	gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
+	domain_dirty_limits(gdtc);
+
+	/* Global domain as a whole is above its background threshold. */
+	if (gdtc->dirty > gdtc->bg_thresh)
+		return true;
+
+	/*
+	 * Compare this wb's reclaimable count with the threshold that
+	 * wb_calc_thresh() derives from the global background threshold.
+	 * NOTE(review): wb_stat_sum() is presumably the exact but more
+	 * expensive read, chosen when the threshold is within twice the
+	 * counter error - confirm against the wb_stat helpers.
+	 */
+	thresh = wb_calc_thresh(gdtc->wb, gdtc->bg_thresh);
+	if (thresh < 2 * wb_stat_error())
+		reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+	else
+		reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+
+	if (reclaimable > thresh)
+		return true;
+
+	/* Repeat both checks against the memcg domain when one is in effect. */
+	if (mdtc) {
+		unsigned long filepages, headroom, writeback;
+
+		mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty,
+				    &writeback);
+		mdtc_calc_avail(mdtc, filepages, headroom);
+		domain_dirty_limits(mdtc);	/* ditto, ignore writeback */
+
+		if (mdtc->dirty > mdtc->bg_thresh)
+			return true;
+
+		thresh = wb_calc_thresh(mdtc->wb, mdtc->bg_thresh);
+		if (thresh < 2 * wb_stat_error())
+			reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+		else
+			reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+
+		if (reclaimable > thresh)
+			return true;
+	}
+
+	return false;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
+ *
+ * Delegates parsing to proc_dointvec() and returns its result.  After a
+ * successful write that changed the interval to a different non-zero value
+ * it also wakes the flusher threads.
+ */
+static int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *length, loff_t *ppos)
+{
+	/* Snapshot taken before proc_dointvec() may overwrite the variable. */
+	unsigned int old_interval = dirty_writeback_interval;
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, length, ppos);
+
+	/*
+	 * Writing 0 to dirty_writeback_interval will disable periodic writeback
+	 * and a different non-zero value will wakeup the writeback threads.
+	 * wb_wakeup_delayed() would be more appropriate, but it's a pain to
+	 * iterate over all bdis and wbs.
+	 * The reason we do this is to make the change take effect immediately.
+	 */
+	if (!ret && write && dirty_writeback_interval &&
+		dirty_writeback_interval != old_interval)
+		wakeup_flusher_threads(WB_REASON_PERIODIC);
+
+	return ret;
+}
+#endif
+
+/*
+ * Timer callback for a bdi's laptop_mode_wb_timer: recover the owning
+ * backing_dev_info from the timer and wake its flusher threads.
+ */
+void laptop_mode_timer_fn(struct timer_list *t)
+{
+	struct backing_dev_info *backing_dev_info =
+		from_timer(backing_dev_info, t, laptop_mode_wb_timer);
+
+	wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
+}
+
+/*
+ * We've spun up the disk and we're in laptop mode: schedule writeback
+ * of all dirty data a few seconds from now.  If the flush is already scheduled
+ * then push it back - the user is still using the disk.
+ */
+void laptop_io_completion(struct backing_dev_info *info)
+{
+	/* laptop_mode doubles as the delay added to jiffies here. */
+	mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
+}
+
+/*
+ * We're in laptop mode and we've just synced. The sync's writes will have
+ * caused another writeback to be scheduled by laptop_io_completion.
+ * Nothing needs to be written back anymore, so we unschedule the writeback.
+ */
+void laptop_sync_completion(void)
+{
+	struct backing_dev_info *bdi;
+
+	/* bdi_list is walked under the RCU read lock. */
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
+		del_timer(&bdi->laptop_mode_wb_timer);
+
+	rcu_read_unlock();
+}
+
+/*
+ * If ratelimit_pages is too high then we can get into dirty-data overload
+ * if a large number of processes all perform writes at the same time.
+ *
+ * Here we set ratelimit_pages to a level which ensures that when all CPUs are
+ * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
+ * thresholds.
+ */
+
+void writeback_set_ratelimit(void)
+{
+	struct wb_domain *dom = &global_wb_domain;
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+
+	/* Only dirty_thresh is used below; background_thresh is ignored. */
+	global_dirty_limits(&background_thresh, &dirty_thresh);
+	dom->dirty_limit = dirty_thresh;
+	/* Each online CPU gets an equal share of 1/32 of the threshold... */
+	ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
+	/* ...but never fewer than 16 pages. */
+	if (ratelimit_pages < 16)
+		ratelimit_pages = 16;
+}
+
+/*
+ * CPU hotplug callback (see page_writeback_init()): recompute
+ * ratelimit_pages.  @cpu is unused.  Always returns 0.
+ */
+static int page_writeback_cpu_online(unsigned int cpu)
+{
+	writeback_set_ratelimit();
+	return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+
+/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
+static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
+
+/*
+ * Writeback tunables; page_writeback_init() registers this table under "vm".
+ * The empty entry at the end terminates the table.
+ */
+static struct ctl_table vm_page_writeback_sysctls[] = {
+	{
+		.procname   = "dirty_background_ratio",
+		.data       = &dirty_background_ratio,
+		.maxlen     = sizeof(dirty_background_ratio),
+		.mode       = 0644,
+		.proc_handler   = dirty_background_ratio_handler,
+		.extra1     = SYSCTL_ZERO,
+		.extra2     = SYSCTL_ONE_HUNDRED,
+	},
+	{
+		.procname   = "dirty_background_bytes",
+		.data       = &dirty_background_bytes,
+		.maxlen     = sizeof(dirty_background_bytes),
+		.mode       = 0644,
+		.proc_handler   = dirty_background_bytes_handler,
+		.extra1     = SYSCTL_LONG_ONE,
+	},
+	{
+		.procname   = "dirty_ratio",
+		.data       = &vm_dirty_ratio,
+		.maxlen     = sizeof(vm_dirty_ratio),
+		.mode       = 0644,
+		.proc_handler   = dirty_ratio_handler,
+		.extra1     = SYSCTL_ZERO,
+		.extra2     = SYSCTL_ONE_HUNDRED,
+	},
+	{
+		.procname   = "dirty_bytes",
+		.data       = &vm_dirty_bytes,
+		.maxlen     = sizeof(vm_dirty_bytes),
+		.mode       = 0644,
+		.proc_handler   = dirty_bytes_handler,
+		.extra1     = (void *)&dirty_bytes_min,
+	},
+	{
+		.procname   = "dirty_writeback_centisecs",
+		.data       = &dirty_writeback_interval,
+		.maxlen     = sizeof(dirty_writeback_interval),
+		.mode       = 0644,
+		.proc_handler   = dirty_writeback_centisecs_handler,
+	},
+	{
+		.procname   = "dirty_expire_centisecs",
+		.data       = &dirty_expire_interval,
+		.maxlen     = sizeof(dirty_expire_interval),
+		.mode       = 0644,
+		.proc_handler   = proc_dointvec_minmax,
+		.extra1     = SYSCTL_ZERO,
+	},
+#ifdef CONFIG_HIGHMEM
+	{
+		.procname	= "highmem_is_dirtyable",
+		.data		= &vm_highmem_is_dirtyable,
+		.maxlen		= sizeof(vm_highmem_is_dirtyable),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif
+	{
+		.procname	= "laptop_mode",
+		.data		= &laptop_mode,
+		.maxlen		= sizeof(laptop_mode),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
+	{}
+};
+#endif
+
+/*
+ * Called early on to tune the page writeback dirty limits.
+ *
+ * We used to scale dirty pages according to how total memory
+ * related to pages that could be allocated for buffers.
+ *
+ * However, that was when we used "dirty_ratio" to scale with
+ * all memory, and we don't do that any more. "dirty_ratio"
+ * is now applied to total non-HIGHPAGE memory, and as such we can't
+ * get into the old insane situation any more where we had
+ * large amounts of dirty pages compared to a small amount of
+ * non-HIGHMEM memory.
+ *
+ * But we might still want to scale the dirty_ratio by how
+ * much memory the box has..
+ */
+void __init page_writeback_init(void)
+{
+	/* Failure to set up the global wb domain is fatal. */
+	BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
+
+	/*
+	 * The same callback is installed as the startup hook of the first
+	 * state and as the teardown hook of the second, so ratelimit_pages is
+	 * recomputed on both hotplug transitions.
+	 */
+	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
+			  page_writeback_cpu_online, NULL);
+	cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
+			  page_writeback_cpu_online);
+#ifdef CONFIG_SYSCTL
+	register_sysctl_init("vm", vm_page_writeback_sysctls);
+#endif
+}
+
+/**
+ * tag_pages_for_writeback - tag pages to be written by write_cache_pages
+ * @mapping: address space structure to write
+ * @start: starting page index
+ * @end: ending page index (inclusive)
+ *
+ * This function scans the page range from @start to @end (inclusive) and tags
+ * all pages that have DIRTY tag set with a special TOWRITE tag.  The idea is
+ * that write_cache_pages (or whoever calls this function) will then use
+ * TOWRITE tag to identify pages eligible for writeback.
This mechanism is + * used to avoid livelocking of writeback by a process steadily creating new + * dirty pages in the file (thus it is important for this function to be quick + * so that it can tag pages faster than a dirtying process can create them). + */ +void tag_pages_for_writeback(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + XA_STATE(xas, &mapping->i_pages, start); + unsigned int tagged = 0; + void *page; + + xas_lock_irq(&xas); + xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) + continue; + + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); + } + xas_unlock_irq(&xas); +} +EXPORT_SYMBOL(tag_pages_for_writeback); + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + * + * To avoid livelocks (when other process dirties new pages), we first tag + * pages which should be written back with TOWRITE tag and only then start + * writing them. For data-integrity sync we have to be careful so that we do + * not miss some pages (e.g., because some other process has cleared TOWRITE + * tag we set). 
The rule we follow is that TOWRITE tag can be cleared only
+ * by the process clearing the DIRTY tag (and submitting the page for IO).
+ *
+ * To avoid deadlocks between range_cyclic writeback and callers that hold
+ * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
+ * we do not loop back to the start of the file. Doing so causes a page
+ * lock/page writeback access order inversion - we should only ever lock
+ * multiple pages in ascending page->index order, and looping back to the start
+ * of the file violates that rule and causes deadlocks.
+ *
+ * Return: %0 on success, negative error code otherwise
+ */
+int write_cache_pages(struct address_space *mapping,
+		      struct writeback_control *wbc, writepage_t writepage,
+		      void *data)
+{
+	int ret = 0;
+	int done = 0;
+	int error;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	pgoff_t done_index;
+	int range_whole = 0;
+	xa_mark_t tag;
+
+	pagevec_init(&pvec);
+	/*
+	 * Cyclic writeback resumes at the index saved by the previous call and
+	 * scans to the largest possible index; otherwise the byte range in
+	 * @wbc is converted to page indices.
+	 */
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_SHIFT;
+		end = wbc->range_end >> PAGE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+	}
+	/*
+	 * Integrity and tagged writeback walk the TOWRITE tags set right here;
+	 * everything else walks the DIRTY tag directly.
+	 */
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
+		tag_pages_for_writeback(mapping, index, end);
+		tag = PAGECACHE_TAG_TOWRITE;
+	} else {
+		tag = PAGECACHE_TAG_DIRTY;
+	}
+	done_index = index;
+	while (!done && (index <= end)) {
+		int i;
+
+		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+				tag);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/* Remember the page being looked at for writeback_index. */
+			done_index = page->index;
+
+			lock_page(page);
+
+			/*
+			 * Page truncated or invalidated. We can freely skip it
+			 * then, even for data integrity operations: the page
+			 * has disappeared concurrently, so there could be no
+			 * real expectation of this data integrity operation
+			 * even if there is now a new, dirty page at the same
+			 * pagecache address.
+			 */
+			if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+				/* Shared "unlock and skip this page" exit. */
+				unlock_page(page);
+				continue;
+			}
+
+			if (!PageDirty(page)) {
+				/* someone wrote it for us */
+				goto continue_unlock;
+			}
+
+			/*
+			 * Already under writeback: wait for it unless this is
+			 * WB_SYNC_NONE, in which case the page is skipped.
+			 */
+			if (PageWriteback(page)) {
+				if (wbc->sync_mode != WB_SYNC_NONE)
+					wait_on_page_writeback(page);
+				else
+					goto continue_unlock;
+			}
+
+			BUG_ON(PageWriteback(page));
+			if (!clear_page_dirty_for_io(page))
+				goto continue_unlock;
+
+			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
+			error = (*writepage)(page, wbc, data);
+			if (unlikely(error)) {
+				/*
+				 * Handle errors according to the type of
+				 * writeback. There's no need to continue for
+				 * background writeback. Just push done_index
+				 * past this page so media errors won't choke
+				 * writeout for the entire file. For integrity
+				 * writeback, we must process the entire dirty
+				 * set regardless of errors because the fs may
+				 * still have state to clear for each page. In
+				 * that case we continue processing and return
+				 * the first error.
+				 */
+				if (error == AOP_WRITEPAGE_ACTIVATE) {
+					/* Not an error: unlock here and carry on. */
+					unlock_page(page);
+					error = 0;
+				} else if (wbc->sync_mode != WB_SYNC_ALL) {
+					ret = error;
+					done_index = page->index + 1;
+					done = 1;
+					break;
+				}
+				/* Keep only the first error seen. */
+				if (!ret)
+					ret = error;
+			}
+
+			/*
+			 * We stop writing back only if we are not doing
+			 * integrity sync. In case of integrity sync we have to
+			 * keep going until we have written all the pages
+			 * we tagged for writeback prior to entering this loop.
+			 */
+			if (--wbc->nr_to_write <= 0 &&
+			    wbc->sync_mode == WB_SYNC_NONE) {
+				done = 1;
+				break;
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	/*
+	 * If we hit the last page and there is more work to be done: wrap
+	 * the index back to the start of the file for the next
+	 * time we are called.
+	 */
+	if (wbc->range_cyclic && !done)
+		done_index = 0;
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = done_index;
+
+	return ret;
+}
+EXPORT_SYMBOL(write_cache_pages);
+
+/*
+ * Function used by generic_writepages to call the real writepage
+ * function and set the mapping flags on error
+ *
+ * @data carries the address_space.  The ->writepage() result is passed to
+ * mapping_set_error() and then returned unchanged.
+ */
+static int __writepage(struct page *page, struct writeback_control *wbc,
+		       void *data)
+{
+	struct address_space *mapping = data;
+	int ret = mapping->a_ops->writepage(page, wbc);
+	mapping_set_error(mapping, ret);
+	return ret;
+}
+
+/**
+ * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ * + * Return: %0 on success, negative error code otherwise + */ +int generic_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct blk_plug plug; + int ret; + + /* deal with chardevs and other special file */ + if (!mapping->a_ops->writepage) + return 0; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __writepage, mapping); + blk_finish_plug(&plug); + return ret; +} + +EXPORT_SYMBOL(generic_writepages); + +int do_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + int ret; + struct bdi_writeback *wb; + + if (wbc->nr_to_write <= 0) + return 0; + wb = inode_to_wb_wbc(mapping->host, wbc); + wb_bandwidth_estimate_start(wb); + while (1) { + if (mapping->a_ops->writepages) + ret = mapping->a_ops->writepages(mapping, wbc); + else + ret = generic_writepages(mapping, wbc); + if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL)) + break; + + /* + * Lacking an allocation context or the locality or writeback + * state of any of the inode's pages, throttle based on + * writeback activity on the local node. It's as good a + * guess as any. + */ + reclaim_throttle(NODE_DATA(numa_node_id()), + VMSCAN_THROTTLE_WRITEBACK); + } + /* + * Usually few pages are written by now from those we've just submitted + * but if there's constant writeback being submitted, this makes sure + * writeback bandwidth is updated once in a while. + */ + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + + BANDWIDTH_INTERVAL)) + wb_update_bandwidth(wb); + return ret; +} + +/** + * folio_write_one - write out a single folio and wait on I/O. + * @folio: The folio to write. + * + * The folio must be locked by the caller and will be unlocked upon return. + * + * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this + * function returns. 
+ * + * Return: %0 on success, negative error code otherwise + */ +int folio_write_one(struct folio *folio) +{ + struct address_space *mapping = folio->mapping; + int ret = 0; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = folio_nr_pages(folio), + }; + + BUG_ON(!folio_test_locked(folio)); + + folio_wait_writeback(folio); + + if (folio_clear_dirty_for_io(folio)) { + folio_get(folio); + ret = mapping->a_ops->writepage(&folio->page, &wbc); + if (ret == 0) + folio_wait_writeback(folio); + folio_put(folio); + } else { + folio_unlock(folio); + } + + if (!ret) + ret = filemap_check_errors(mapping); + return ret; +} +EXPORT_SYMBOL(folio_write_one); + +/* + * For address_spaces which do not use buffers nor write back. + */ +bool noop_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + if (!folio_test_dirty(folio)) + return !folio_test_set_dirty(folio); + return false; +} +EXPORT_SYMBOL(noop_dirty_folio); + +/* + * Helper function for set_page_dirty family. + * + * Caller must hold lock_page_memcg(). + * + * NOTE: This relies on being atomic wrt interrupts. + */ +static void folio_account_dirtied(struct folio *folio, + struct address_space *mapping) +{ + struct inode *inode = mapping->host; + + trace_writeback_dirty_folio(folio, mapping); + + if (mapping_can_writeback(mapping)) { + struct bdi_writeback *wb; + long nr = folio_nr_pages(folio); + + inode_attach_wb(inode, &folio->page); + wb = inode_to_wb(inode); + + __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); + __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); + __node_stat_mod_folio(folio, NR_DIRTIED, nr); + wb_stat_mod(wb, WB_RECLAIMABLE, nr); + wb_stat_mod(wb, WB_DIRTIED, nr); + task_io_account_write(nr * PAGE_SIZE); + current->nr_dirtied += nr; + __this_cpu_add(bdp_ratelimits, nr); + + mem_cgroup_track_foreign_dirty(folio, wb); + } +} + +/* + * Helper function for deaccounting dirty page without writeback. + * + * Caller must hold lock_page_memcg(). 
+ */
+void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
+{
+	const long nr_pages = folio_nr_pages(folio);
+
+	/* Take every page of the folio back out of the dirty-side counters. */
+	lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr_pages);
+	zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr_pages);
+	wb_stat_mod(wb, WB_RECLAIMABLE, -nr_pages);
+	task_io_account_cancelled_write(nr_pages * PAGE_SIZE);
+}
+
+/*
+ * Mark the folio dirty, and set it dirty in the page cache, and mark
+ * the inode dirty.
+ *
+ * If warn is true, then emit a warning if the folio is not uptodate and has
+ * not been truncated.
+ *
+ * The caller must hold lock_page_memcg().  Most callers have the folio
+ * locked.  A few have the folio blocked from truncation through other
+ * means (eg zap_page_range() has it mapped and is holding the page table
+ * lock).  This can also be called from mark_buffer_dirty(), which I
+ * cannot prove is always protected against truncate.
+ */
+void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
+			     int warn)
+{
+	unsigned long irq_flags;
+
+	xa_lock_irqsave(&mapping->i_pages, irq_flags);
+
+	/* No ->mapping any more (race with truncate?): leave it alone. */
+	if (!folio->mapping)
+		goto unlock;
+
+	WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
+	folio_account_dirtied(folio, mapping);
+	__xa_set_mark(&mapping->i_pages, folio_index(folio),
+		      PAGECACHE_TAG_DIRTY);
+unlock:
+	xa_unlock_irqrestore(&mapping->i_pages, irq_flags);
+}
+
+/**
+ * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads.
+ * @mapping: Address space this folio belongs to.
+ * @folio: Folio to be marked as dirty.
+ *
+ * Filesystems which do not use buffer heads should call this function
+ * from their set_page_dirty address space operation.  It ignores the
+ * contents of folio_get_private(), so if the filesystem marks individual
+ * blocks as dirty, the filesystem should handle that itself.
+ *
+ * This is also sometimes used by filesystems which use buffer_heads when
+ * a single buffer is being dirtied: we want to set the folio dirty in
+ * that case, but not all the buffers.
This is a "bottom-up" dirtying, + * whereas block_dirty_folio() is a "top-down" dirtying. + * + * The caller must ensure this doesn't race with truncation. Most will + * simply hold the folio lock, but e.g. zap_pte_range() calls with the + * folio mapped and the pte lock held, which also locks out truncation. + */ +bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + folio_memcg_lock(folio); + if (folio_test_set_dirty(folio)) { + folio_memcg_unlock(folio); + return false; + } + + __folio_mark_dirty(folio, mapping, !folio_test_private(folio)); + folio_memcg_unlock(folio); + + if (mapping->host) { + /* !PageAnon && !swapper_space */ + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + return true; +} +EXPORT_SYMBOL(filemap_dirty_folio); + +/** + * folio_account_redirty - Manually account for redirtying a page. + * @folio: The folio which is being redirtied. + * + * Most filesystems should call folio_redirty_for_writepage() instead + * of this fuction. If your filesystem is doing writeback outside the + * context of a writeback_control(), it can call this when redirtying + * a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED, + * tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN, + * WB_WRITTEN) in long term. The mismatches will lead to systematic errors + * in balanced_dirty_ratelimit and the dirty pages position control. 
+ */ +void folio_account_redirty(struct folio *folio) +{ + struct address_space *mapping = folio->mapping; + + if (mapping && mapping_can_writeback(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; + long nr = folio_nr_pages(folio); + + wb = unlocked_inode_to_wb_begin(inode, &cookie); + current->nr_dirtied -= nr; + node_stat_mod_folio(folio, NR_DIRTIED, -nr); + wb_stat_mod(wb, WB_DIRTIED, -nr); + unlocked_inode_to_wb_end(inode, &cookie); + } +} +EXPORT_SYMBOL(folio_account_redirty); + +/** + * folio_redirty_for_writepage - Decline to write a dirty folio. + * @wbc: The writeback control. + * @folio: The folio. + * + * When a writepage implementation decides that it doesn't want to write + * @folio for some reason, it should call this function, unlock @folio and + * return 0. + * + * Return: True if we redirtied the folio. False if someone else dirtied + * it first. + */ +bool folio_redirty_for_writepage(struct writeback_control *wbc, + struct folio *folio) +{ + bool ret; + long nr = folio_nr_pages(folio); + + wbc->pages_skipped += nr; + ret = filemap_dirty_folio(folio->mapping, folio); + folio_account_redirty(folio); + + return ret; +} +EXPORT_SYMBOL(folio_redirty_for_writepage); + +/** + * folio_mark_dirty - Mark a folio as being modified. + * @folio: The folio. + * + * The folio may not be truncated while this function is running. + * Holding the folio lock is sufficient to prevent truncation, but some + * callers cannot acquire a sleeping lock. These callers instead hold + * the page table lock for a page table which contains at least one page + * in this folio. Truncation will block on the page table lock as it + * unmaps pages before removing the folio from its mapping. + * + * Return: True if the folio was newly dirtied, false if it was already dirty. 
+ */ +bool folio_mark_dirty(struct folio *folio) +{ + struct address_space *mapping = folio_mapping(folio); + + if (likely(mapping)) { + /* + * readahead/lru_deactivate_page could remain + * PG_readahead/PG_reclaim due to race with folio_end_writeback + * About readahead, if the folio is written, the flags would be + * reset. So no problem. + * About lru_deactivate_page, if the folio is redirtied, + * the flag will be reset. So no problem. but if the + * folio is used by readahead it will confuse readahead + * and make it restart the size rampup process. But it's + * a trivial problem. + */ + if (folio_test_reclaim(folio)) + folio_clear_reclaim(folio); + return mapping->a_ops->dirty_folio(mapping, folio); + } + + return noop_dirty_folio(mapping, folio); +} +EXPORT_SYMBOL(folio_mark_dirty); + +/* + * set_page_dirty() is racy if the caller has no reference against + * page->mapping->host, and if the page is unlocked. This is because another + * CPU could truncate the page off the mapping and then free the mapping. + * + * Usually, the page _is_ locked, or the caller is a user-space process which + * holds a reference on the inode by having an open file. + * + * In other cases, the page should be locked before running set_page_dirty(). + */ +int set_page_dirty_lock(struct page *page) +{ + int ret; + + lock_page(page); + ret = set_page_dirty(page); + unlock_page(page); + return ret; +} +EXPORT_SYMBOL(set_page_dirty_lock); + +/* + * This cancels just the dirty bit on the kernel page itself, it does NOT + * actually remove dirty bits on any mmap's that may be around. It also + * leaves the page tagged dirty, so any sync activity will still find it on + * the dirty lists, and in particular, clear_page_dirty_for_io() will still + * look at the dirty bits in the VM. + * + * Doing this should *normally* only ever be done when a page is truncated, + * and is not actually mapped anywhere at all. 
However, fs/buffer.c does + * this when it notices that somebody has cleaned out all the buffers on a + * page without actually doing it through the VM. Can you say "ext3 is + * horribly ugly"? Thought you could. + */ +void __folio_cancel_dirty(struct folio *folio) +{ + struct address_space *mapping = folio_mapping(folio); + + if (mapping_can_writeback(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; + + folio_memcg_lock(folio); + wb = unlocked_inode_to_wb_begin(inode, &cookie); + + if (folio_test_clear_dirty(folio)) + folio_account_cleaned(folio, wb); + + unlocked_inode_to_wb_end(inode, &cookie); + folio_memcg_unlock(folio); + } else { + folio_clear_dirty(folio); + } +} +EXPORT_SYMBOL(__folio_cancel_dirty); + +/* + * Clear a folio's dirty flag, while caring for dirty memory accounting. + * Returns true if the folio was previously dirty. + * + * This is for preparing to put the folio under writeout. We leave + * the folio tagged as dirty in the xarray so that a concurrent + * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk. + * The ->writepage implementation will run either folio_start_writeback() + * or folio_mark_dirty(), at which stage we bring the folio's dirty flag + * and xarray dirty tag back into sync. + * + * This incoherency between the folio's dirty flag and xarray tag is + * unfortunate, but it only exists while the folio is locked. + */ +bool folio_clear_dirty_for_io(struct folio *folio) +{ + struct address_space *mapping = folio_mapping(folio); + bool ret = false; + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (mapping && mapping_can_writeback(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; + + /* + * Yes, Virginia, this is indeed insane. 
		 *
		 * We use this sequence to make sure that
		 *  (a) we account for dirty stats properly
		 *  (b) we tell the low-level filesystem to
		 *      mark the whole folio dirty if it was
		 *      dirty in a pagetable. Only to then
		 *  (c) clean the folio again and return true to
		 *      cause the writeback.
		 *
		 * This way we avoid all nasty races with the
		 * dirty bit in multiple places and clearing
		 * them concurrently from different threads.
		 *
		 * Note! Normally the "folio_mark_dirty(folio)"
		 * has no effect on the actual dirty bit - since
		 * that will already usually be set. But we
		 * need the side effects, and it can help us
		 * avoid races.
		 *
		 * We basically use the folio "master dirty bit"
		 * as a serialization point for all the different
		 * threads doing their things.
		 */
		if (folio_mkclean(folio))
			folio_mark_dirty(folio);
		/*
		 * We carefully synchronise fault handlers against
		 * installing a dirty pte and marking the folio dirty
		 * at this point.  We do this by having them hold the
		 * page lock while dirtying the folio, and folios are
		 * always locked coming in here, so we get the desired
		 * exclusion.
		 */
		wb = unlocked_inode_to_wb_begin(inode, &cookie);
		if (folio_test_clear_dirty(folio)) {
			long nr = folio_nr_pages(folio);
			/* The folio was dirty: drop it from the dirty counters. */
			lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
			zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
			wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
			ret = true;
		}
		unlocked_inode_to_wb_end(inode, &cookie);
		return ret;
	}
	/* No writeback-capable mapping: just test-and-clear the flag. */
	return folio_test_clear_dirty(folio);
}
EXPORT_SYMBOL(folio_clear_dirty_for_io);

/* Count one more inode with writeback in flight on @wb. */
static void wb_inode_writeback_start(struct bdi_writeback *wb)
{
	atomic_inc(&wb->writeback_inodes);
}

/*
 * Count one fewer inode with writeback in flight on @wb and schedule a
 * delayed bandwidth update if @wb is still registered.
 */
static void wb_inode_writeback_end(struct bdi_writeback *wb)
{
	unsigned long flags;
	atomic_dec(&wb->writeback_inodes);
	/*
	 * Make sure estimate of writeback throughput gets updated after
	 * writeback completed.
	 * We delay the update by BANDWIDTH_INTERVAL
	 * (which is the interval other bandwidth updates use for batching) so
	 * that if multiple inodes end writeback at a similar time, they get
	 * batched into one bandwidth update.
	 */
	spin_lock_irqsave(&wb->work_lock, flags);
	if (test_bit(WB_registered, &wb->state))
		queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
	spin_unlock_irqrestore(&wb->work_lock, flags);
}

/*
 * Clear the writeback flag of @folio and update the writeback bookkeeping.
 *
 * When the mapping uses writeback tags, the flag is cleared under the
 * i_pages xarray lock together with the folio's PAGECACHE_TAG_WRITEBACK
 * mark, and the per-wb counters are adjusted if the bdi has
 * BDI_CAP_WRITEBACK_ACCT.  Otherwise only the flag is test-and-cleared.
 *
 * Return: the result of folio_test_clear_writeback(); the node/zone/lruvec
 * statistics are only adjusted when it is true.
 */
bool __folio_end_writeback(struct folio *folio)
{
	long nr = folio_nr_pages(folio);
	struct address_space *mapping = folio_mapping(folio);
	bool ret;

	folio_memcg_lock(folio);
	if (mapping && mapping_use_writeback_tags(mapping)) {
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		xa_lock_irqsave(&mapping->i_pages, flags);
		ret = folio_test_clear_writeback(folio);
		if (ret) {
			__xa_clear_mark(&mapping->i_pages, folio_index(folio),
						PAGECACHE_TAG_WRITEBACK);
			if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
				struct bdi_writeback *wb = inode_to_wb(inode);

				wb_stat_mod(wb, WB_WRITEBACK, -nr);
				__wb_writeout_add(wb, nr);
				/* Last tagged folio gone: inode left writeback. */
				if (!mapping_tagged(mapping,
						    PAGECACHE_TAG_WRITEBACK))
					wb_inode_writeback_end(wb);
			}
		}

		if (mapping->host && !mapping_tagged(mapping,
						     PAGECACHE_TAG_WRITEBACK))
			sb_clear_inode_writeback(mapping->host);

		xa_unlock_irqrestore(&mapping->i_pages, flags);
	} else {
		ret = folio_test_clear_writeback(folio);
	}
	if (ret) {
		lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
		zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
		node_stat_mod_folio(folio, NR_WRITTEN, nr);
	}
	folio_memcg_unlock(folio);
	return ret;
}

/*
 * Set the writeback flag of @folio and update the writeback bookkeeping.
 *
 * When the mapping uses writeback tags, this runs under the i_pages xarray
 * lock: the folio gets the PAGECACHE_TAG_WRITEBACK mark, loses
 * PAGECACHE_TAG_DIRTY if it is no longer dirty, and loses
 * PAGECACHE_TAG_TOWRITE unless @keep_write is set.
 *
 * Return: the result of folio_test_set_writeback(); statistics are only
 * adjusted when it is false.
 */
bool __folio_start_writeback(struct folio *folio, bool keep_write)
{
	long nr = folio_nr_pages(folio);
	struct address_space *mapping = folio_mapping(folio);
	bool ret;
	int access_ret;

	folio_memcg_lock(folio);
	if (mapping && mapping_use_writeback_tags(mapping)) {
		XA_STATE(xas, &mapping->i_pages, folio_index(folio));
		struct inode *inode = mapping->host;
		struct backing_dev_info *bdi = inode_to_bdi(inode);
		unsigned long flags;

		xas_lock_irqsave(&xas, flags);
		xas_load(&xas);
		ret = folio_test_set_writeback(folio);
		if (!ret) {
			bool on_wblist;

			/* Sampled before this folio's own mark is set below. */
			on_wblist = mapping_tagged(mapping,
						   PAGECACHE_TAG_WRITEBACK);

			xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
			if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
				struct bdi_writeback *wb = inode_to_wb(inode);

				wb_stat_mod(wb, WB_WRITEBACK, nr);
				if (!on_wblist)
					wb_inode_writeback_start(wb);
			}

			/*
			 * We can come through here when swapping
			 * anonymous folios, so we don't necessarily
			 * have an inode to track for sync.
			 */
			if (mapping->host && !on_wblist)
				sb_mark_inode_writeback(mapping->host);
		}
		if (!folio_test_dirty(folio))
			xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
		if (!keep_write)
			xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
		xas_unlock_irqrestore(&xas, flags);
	} else {
		ret = folio_test_set_writeback(folio);
	}
	if (!ret) {
		lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
		zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
	}
	folio_memcg_unlock(folio);
	access_ret = arch_make_folio_accessible(folio);
	/*
	 * If writeback has been triggered on a page that cannot be made
	 * accessible, it is too late to recover here.
	 */
	VM_BUG_ON_FOLIO(access_ret != 0, folio);

	return ret;
}
EXPORT_SYMBOL(__folio_start_writeback);

/**
 * folio_wait_writeback - Wait for a folio to finish writeback.
 * @folio: The folio to wait for.
 *
 * If the folio is currently being written back to storage, wait for the
 * I/O to complete.
 *
 * Context: Sleeps.  Must be called in process context and with
 * no spinlocks held.  Caller should hold a reference on the folio.
 * If the folio is not locked, writeback may start again after writeback
 * has finished.
+ */ +void folio_wait_writeback(struct folio *folio) +{ + while (folio_test_writeback(folio)) { + trace_folio_wait_writeback(folio, folio_mapping(folio)); + folio_wait_bit(folio, PG_writeback); + } +} +EXPORT_SYMBOL_GPL(folio_wait_writeback); + +/** + * folio_wait_writeback_killable - Wait for a folio to finish writeback. + * @folio: The folio to wait for. + * + * If the folio is currently being written back to storage, wait for the + * I/O to complete or a fatal signal to arrive. + * + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. + * Return: 0 on success, -EINTR if we get a fatal signal while waiting. + */ +int folio_wait_writeback_killable(struct folio *folio) +{ + while (folio_test_writeback(folio)) { + trace_folio_wait_writeback(folio, folio_mapping(folio)); + if (folio_wait_bit_killable(folio, PG_writeback)) + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); + +/** + * folio_wait_stable() - wait for writeback to finish, if necessary. + * @folio: The folio to wait on. + * + * This function determines if the given folio is related to a backing + * device that requires folio contents to be held stable during writeback. + * If so, then it will wait for any pending writeback to complete. + * + * Context: Sleeps. Must be called in process context and with + * no spinlocks held. Caller should hold a reference on the folio. + * If the folio is not locked, writeback may start again after writeback + * has finished. 
+ */ +void folio_wait_stable(struct folio *folio) +{ + if (mapping_stable_writes(folio_mapping(folio))) + folio_wait_writeback(folio); +} +EXPORT_SYMBOL_GPL(folio_wait_stable); diff --git a/mm/page_alloc.c b/mm/page_alloc.c new file mode 100644 index 000000000..c783806ee --- /dev/null +++ b/mm/page_alloc.c @@ -0,0 +1,9728 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/mm/page_alloc.c + * + * Manages the free list, the system allocates free pages here. + * Note that kmalloc() lives in slab.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 + * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 + * (lots of bits borrowed from Ingo Molnar & Andrew Morton) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include "shuffle.h" +#include "page_reporting.h" +#include "swap.h" + +/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ +typedef int __bitwise fpi_t; + +/* No special request */ +#define FPI_NONE ((__force fpi_t)0) + +/* + * Skip free page reporting notification for the (possibly merged) page. 
+ * This does not hinder free page reporting from grabbing the page, + * reporting it and marking it "reported" - it only skips notifying + * the free page reporting infrastructure about a newly freed page. For + * example, used when temporarily pulling a page from a freelist and + * putting it back unmodified. + */ +#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0)) + +/* + * Place the (possibly merged) page to the tail of the freelist. Will ignore + * page shuffling (relevant code - e.g., memory onlining - is expected to + * shuffle the whole zone). + * + * Note: No code should rely on this flag for correctness - it's purely + * to allow for optimizations when handing back either fresh pages + * (memory onlining) or untouched pages (page isolation, free page + * reporting). + */ +#define FPI_TO_TAIL ((__force fpi_t)BIT(1)) + +/* + * Don't poison memory with KASAN (only for the tag-based modes). + * During boot, all non-reserved memblock memory is exposed to page_alloc. + * Poisoning all that memory lengthens boot time, especially on systems with + * large amount of RAM. This flag is used to skip that poisoning. + * This is only done for the tag-based KASAN modes, as those are able to + * detect memory corruptions with the memory tags assigned by default. + * All memory allocated normally after boot gets poisoned as usual. + */ +#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2)) + +/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ +static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +/* + * On SMP, spin_trylock is sufficient protection. + * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP. + */ +#define pcp_trylock_prepare(flags) do { } while (0) +#define pcp_trylock_finish(flag) do { } while (0) +#else + +/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. 
*/ +#define pcp_trylock_prepare(flags) local_irq_save(flags) +#define pcp_trylock_finish(flags) local_irq_restore(flags) +#endif + +/* + * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid + * a migration causing the wrong PCP to be locked and remote memory being + * potentially allocated, pin the task to the CPU for the lookup+lock. + * preempt_disable is used on !RT because it is faster than migrate_disable. + * migrate_disable is used on RT because otherwise RT spinlock usage is + * interfered with and a high priority task cannot preempt the allocator. + */ +#ifndef CONFIG_PREEMPT_RT +#define pcpu_task_pin() preempt_disable() +#define pcpu_task_unpin() preempt_enable() +#else +#define pcpu_task_pin() migrate_disable() +#define pcpu_task_unpin() migrate_enable() +#endif + +/* + * Generic helper to lookup and a per-cpu variable with an embedded spinlock. + * Return value should be used with equivalent unlock helper. + */ +#define pcpu_spin_lock(type, member, ptr) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + spin_lock(&_ret->member); \ + _ret; \ +}) + +#define pcpu_spin_trylock(type, member, ptr) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + if (!spin_trylock(&_ret->member)) { \ + pcpu_task_unpin(); \ + _ret = NULL; \ + } \ + _ret; \ +}) + +#define pcpu_spin_unlock(member, ptr) \ +({ \ + spin_unlock(&ptr->member); \ + pcpu_task_unpin(); \ +}) + +/* struct per_cpu_pages specific helpers. 
 */
/* Pin the task, look up this CPU's per_cpu_pages and take its ->lock. */
#define pcp_spin_lock(ptr)						\
	pcpu_spin_lock(struct per_cpu_pages, lock, ptr)

/* Trylock variant; evaluates to NULL when ->lock could not be taken. */
#define pcp_spin_trylock(ptr)						\
	pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)

/* Release ->lock and unpin the task. */
#define pcp_spin_unlock(ptr)						\
	pcpu_spin_unlock(lock, ptr)

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif

static DEFINE_MUTEX(pcpu_drain_mutex);

#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	/* Without NUMA, bit 0 (node 0) is preset in the remaining states. */
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_MEMORY] = { { [0] = 1UL } },
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);

atomic_long_t _totalram_pages __read_mostly;
EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;

int percpu_pagelist_high_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);

DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);

/*
 * Early copy of the "init_on_alloc" boot parameter, defaulting to the
 * Kconfig choice.  init_mem_debugging_and_hardening() later turns it into
 * the init_on_alloc static key.
 */
static bool _init_on_alloc_enabled_early __read_mostly
				= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
/* Parse "init_on_alloc=<bool>"; returns the kstrtobool() result. */
static int __init early_init_on_alloc(char *buf)
{

	return kstrtobool(buf, &_init_on_alloc_enabled_early);
}
early_param("init_on_alloc", early_init_on_alloc);

/*
 * Early copy of the "init_on_free" boot parameter, defaulting to the
 * Kconfig choice.  init_mem_debugging_and_hardening() later turns it into
 * the init_on_free static key.
 */
static bool _init_on_free_enabled_early __read_mostly
				= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
/* Parse "init_on_free=<bool>"; returns the kstrtobool() result. */
static int __init early_init_on_free(char *buf)
{
	return kstrtobool(buf, &_init_on_free_enabled_early);
}
early_param("init_on_free", early_init_on_free);

/*
 * A cached value of the page's pageblock's migratetype, used when the page is
 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
 * Also the migratetype set in the page does not necessarily match the pcplist
 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
 * other index - this ensures that it will be put on the correct CMA freelist.
+ */ +static inline int get_pcppage_migratetype(struct page *page) +{ + return page->index; +} + +static inline void set_pcppage_migratetype(struct page *page, int migratetype) +{ + page->index = migratetype; +} + +#ifdef CONFIG_PM_SLEEP +/* + * The following functions are used by the suspend/hibernate code to temporarily + * change gfp_allowed_mask in order to avoid using I/O during memory allocations + * while devices are suspended. To avoid races with the suspend/hibernate code, + * they should always be called with system_transition_mutex held + * (gfp_allowed_mask also should only be modified with system_transition_mutex + * held, unless the suspend/hibernate code is guaranteed not to run in parallel + * with that modification). + */ + +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + if (saved_gfp_mask) { + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + } +} + +void pm_restrict_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&system_transition_mutex)); + WARN_ON(saved_gfp_mask); + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); +} + +bool pm_suspended_storage(void) +{ + if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) + return false; + return true; +} +#endif /* CONFIG_PM_SLEEP */ + +#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE +unsigned int pageblock_order __read_mostly; +#endif + +static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags); + +/* + * results with 256, 32 in the lowmem_reserve sysctl: + * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) + * 1G machine -> (16M dma, 784M normal, 224M high) + * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA + * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL + * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA + * + * TBD: should special case ZONE_DMA32 machines here - in those 
 * we normally
 * don't need any ZONE_NORMAL reservation
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	[ZONE_DMA32] = 256,
#endif
	[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
	[ZONE_HIGHMEM] = 0,
#endif
	[ZONE_MOVABLE] = 0,
};

/* Zone names; entries exist only for the zones that are configured in. */
static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
#ifdef CONFIG_ZONE_DEVICE
	 "Device",
#endif
};

/* Migratetype names, listed positionally. */
const char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Movable",
	"Reclaimable",
	"HighAtomic",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};

/* Destructor table indexed by compound destructor id; see destroy_large_folio(). */
compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
	[NULL_COMPOUND_DTOR] = NULL,
	[COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
	[HUGETLB_PAGE_DTOR] = free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	[TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
#endif
};

int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
int watermark_boost_factor __read_mostly = 15000;
int watermark_scale_factor = 10;

static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
static unsigned long dma_reserve __initdata;

static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
bool mirrored_kernelcore __initdata_memblock;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);

#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif

int page_group_by_mobility_disabled __read_mostly;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * During boot we initialize deferred pages on-demand, as needed, but once
 * page_alloc_init_late() has finished, the deferred pages are all initialized,
 * and we can permanently disable that path.
 */
static DEFINE_STATIC_KEY_TRUE(deferred_pages);

static inline bool deferred_pages_enabled(void)
{
	return static_branch_unlikely(&deferred_pages);
}

/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
	int nid = early_pfn_to_nid(pfn);

	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
		return true;

	return false;
}

/*
 * Returns true when the remaining initialisation should be deferred until
 * later in the boot cycle when it can be parallelised.
 */
static bool __meminit
defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	static unsigned long prev_end_pfn, nr_initialised;

	if (early_page_ext_enabled())
		return false;
	/*
	 * prev_end_pfn is a static that holds the end of the previous zone;
	 * the per-zone page count restarts whenever end_pfn changes.
	 * No need to protect because called very early in boot before smp_init.
	 */
	if (prev_end_pfn != end_pfn) {
		prev_end_pfn = end_pfn;
		nr_initialised = 0;
	}

	/* Always populate low zones for address-constrained allocations */
	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
		return false;

	/* A deferral point was already recorded for this node. */
	if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
		return true;
	/*
	 * We start only with one section of pages, more pages are added as
	 * needed until the rest of deferred pages are initialized.
	 */
	nr_initialised++;
	/* Defer from the first section-aligned pfn past one section's worth. */
	if ((nr_initialised > PAGES_PER_SECTION) &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		NODE_DATA(nid)->first_deferred_pfn = pfn;
		return true;
	}
	return false;
}
#else
/* !CONFIG_DEFERRED_STRUCT_PAGE_INIT: nothing is ever deferred. */
static inline bool deferred_pages_enabled(void)
{
	return false;
}

static inline bool early_page_uninitialised(unsigned long pfn)
{
	return false;
}

static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	return false;
}
#endif

/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(const struct page *page,
							unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	return section_to_usemap(__pfn_to_section(pfn));
#else
	return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}

/*
 * Return the index of the first flag bit of @pfn's pageblock within the
 * bitmap returned by get_pageblock_bitmap().  The pfn is first made
 * relative to its section (SPARSEMEM) or to the pageblock containing the
 * zone start (otherwise).
 */
static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	pfn &= (PAGES_PER_SECTION-1);
#else
	pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
#endif /* CONFIG_SPARSEMEM */
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}

/*
 * Read the pageblock flag bits selected by @mask for the block containing
 * @pfn.  The bits are returned shifted down so they start at bit 0.
 */
static __always_inline
unsigned long __get_pfnblock_flags_mask(const struct page *page,
					unsigned long pfn,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long word;

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	/* Split the bit index into a word index and a bit offset in the word. */
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);
	/*
	 * This races, without locks, with set_pfnblock_flags_mask(). Ensure
	 * a consistent read of the memory array, so that results, even though
	 * racy, are not corrupted.
+ */ + word = READ_ONCE(bitmap[word_bitidx]); + return (word >> bitidx) & mask; +} + +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags + */ +unsigned long get_pfnblock_flags_mask(const struct page *page, + unsigned long pfn, unsigned long mask) +{ + return __get_pfnblock_flags_mask(page, pfn, mask); +} + +static __always_inline int get_pfnblock_migratetype(const struct page *page, + unsigned long pfn) +{ + return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); +} + +/** + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @flags: The flags to set + * @pfn: The target page frame number + * @mask: mask of bits that the caller is interested in + */ +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); + BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); + + mask <<= bitidx; + flags <<= bitidx; + + word = READ_ONCE(bitmap[word_bitidx]); + do { + } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags)); +} + +void set_pageblock_migratetype(struct page *page, int migratetype) +{ + if (unlikely(page_group_by_mobility_disabled && + migratetype < MIGRATE_PCPTYPES)) + migratetype = MIGRATE_UNMOVABLE; + + set_pfnblock_flags_mask(page, (unsigned long)migratetype, + page_to_pfn(page), MIGRATETYPE_MASK); 
+} + +#ifdef CONFIG_DEBUG_VM +static int page_outside_zone_boundaries(struct zone *zone, struct page *page) +{ + int ret = 0; + unsigned seq; + unsigned long pfn = page_to_pfn(page); + unsigned long sp, start_pfn; + + do { + seq = zone_span_seqbegin(zone); + start_pfn = zone->zone_start_pfn; + sp = zone->spanned_pages; + if (!zone_spans_pfn(zone, pfn)) + ret = 1; + } while (zone_span_seqretry(zone, seq)); + + if (ret) + pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", + pfn, zone_to_nid(zone), zone->name, + start_pfn, start_pfn + sp); + + return ret; +} + +static int page_is_consistent(struct zone *zone, struct page *page) +{ + if (zone != page_zone(page)) + return 0; + + return 1; +} +/* + * Temporary debugging check for pages not lying within a given zone. + */ +static int __maybe_unused bad_range(struct zone *zone, struct page *page) +{ + if (page_outside_zone_boundaries(zone, page)) + return 1; + if (!page_is_consistent(zone, page)) + return 1; + + return 0; +} +#else +static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + +static void bad_page(struct page *page, const char *reason) +{ + static unsigned long resume; + static unsigned long nr_shown; + static unsigned long nr_unshown; + + /* + * Allow a burst of 60 reports, then keep quiet for that minute; + * or allow a steady drip of one report per second. 
+ */ + if (nr_shown == 60) { + if (time_before(jiffies, resume)) { + nr_unshown++; + goto out; + } + if (nr_unshown) { + pr_alert( + "BUG: Bad page state: %lu messages suppressed\n", + nr_unshown); + nr_unshown = 0; + } + nr_shown = 0; + } + if (nr_shown++ == 0) + resume = jiffies + 60 * HZ; + + pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", + current->comm, page_to_pfn(page)); + dump_page(page, reason); + + print_modules(); + dump_stack(); +out: + /* Leave bad fields for debug, except PageBuddy could make trouble */ + page_mapcount_reset(page); /* remove PageBuddy */ + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); +} + +static inline unsigned int order_to_pindex(int migratetype, int order) +{ + int base = order; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order > PAGE_ALLOC_COSTLY_ORDER) { + VM_BUG_ON(order != pageblock_order); + return NR_LOWORDER_PCP_LISTS; + } +#else + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); +#endif + + return (MIGRATE_PCPTYPES * base) + migratetype; +} + +static inline int pindex_to_order(unsigned int pindex) +{ + int order = pindex / MIGRATE_PCPTYPES; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pindex == NR_LOWORDER_PCP_LISTS) + order = pageblock_order; +#else + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); +#endif + + return order; +} + +static inline bool pcp_allowed_order(unsigned int order) +{ + if (order <= PAGE_ALLOC_COSTLY_ORDER) + return true; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order == pageblock_order) + return true; +#endif + return false; +} + +static inline void free_the_page(struct page *page, unsigned int order) +{ + if (pcp_allowed_order(order)) /* Via pcp? */ + free_unref_page(page, order); + else + __free_pages_ok(page, order, FPI_NONE); +} + +/* + * Higher-order pages are called "compound pages". They are structured thusly: + * + * The first PAGE_SIZE page is called the "head page" and have PG_head set. + * + * The remaining PAGE_SIZE pages are called "tail pages". 
 * PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits is a pointer to the
 * head page.
 *
 * The first tail page's ->compound_dtor holds the offset in array of compound
 * page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */

/* Default compound destructor: uncharge the folio, then free the pages. */
void free_compound_page(struct page *page)
{
	mem_cgroup_uncharge(page_folio(page));
	free_the_page(page, compound_order(page));
}

/* Initialise the compound metadata: destructor, order, map and pin counts. */
static void prep_compound_head(struct page *page, unsigned int order)
{
	set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
	set_compound_order(page, order);
	atomic_set(compound_mapcount_ptr(page), -1);
	atomic_set(compound_pincount_ptr(page), 0);
}

/* Initialise tail page number @tail_idx of the compound page at @head. */
static void prep_compound_tail(struct page *head, int tail_idx)
{
	struct page *p = head + tail_idx;

	p->mapping = TAIL_MAPPING;
	set_compound_head(p, head);
	set_page_private(p, 0);
}

/* Turn the 2^@order pages starting at @page into one compound page. */
void prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	__SetPageHead(page);
	/* Index 0 is the head page itself; tails start at 1. */
	for (i = 1; i < nr_pages; i++)
		prep_compound_tail(page, i);

	prep_compound_head(page, order);
}

/*
 * Invoke the destructor recorded in the folio's first tail page, looked up
 * in compound_page_dtors[].
 */
void destroy_large_folio(struct folio *folio)
{
	enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor;

	VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
	compound_page_dtors[dtor](&folio->page);
}

#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;

bool _debug_pagealloc_enabled_early __read_mostly
			= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
EXPORT_SYMBOL(_debug_pagealloc_enabled);

DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);

/* Parse "debug_pagealloc=<bool>"; returns the kstrtobool() result. */
static int __init early_debug_pagealloc(char *buf)
{
	return kstrtobool(buf, &_debug_pagealloc_enabled_early);
}
early_param("debug_pagealloc", early_debug_pagealloc);

/*
 * Parse "debug_guardpage_minorder=<n>".  A value that does not parse as a
 * decimal number or exceeds MAX_ORDER / 2 is reported and ignored.  Returns
 * 0 in every case.
 */
static int __init debug_guardpage_minorder_setup(char *buf)
{
	unsigned long res;

	if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
		pr_err("Bad debug_guardpage_minorder value\n");
		return 0;
	}
	_debug_guardpage_minorder = res;
	pr_info("Setting debug_guardpage_minorder to %lu\n", res);
	return 0;
}
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);

/*
 * Mark @page as a guard page of the given @order.
 *
 * Return: false (and nothing is changed) when guard pages are disabled or
 * @order is not below debug_guardpage_minorder(); true otherwise.
 */
static inline bool set_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	if (!debug_guardpage_enabled())
		return false;

	if (order >= debug_guardpage_minorder())
		return false;

	__SetPageGuard(page);
	INIT_LIST_HEAD(&page->buddy_list);
	set_page_private(page, order);
	/* Guard pages are not available for any usage */
	if (!is_migrate_isolate(migratetype))
		__mod_zone_freepage_state(zone, -(1 << order), migratetype);

	return true;
}

/* Undo set_page_guard(): clear the guard flag and restore the free count. */
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	if (!debug_guardpage_enabled())
		return;

	__ClearPageGuard(page);

	set_page_private(page, 0);
	if (!is_migrate_isolate(migratetype))
		__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
static inline bool set_page_guard(struct zone *zone, struct page *page,
			unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype) {}
#endif

/*
 * Enable static keys related to various memory debugging and hardening options.
 * Some override others, and depend on early params that are evaluated in the
 * order of appearance. So we need to first gather the full picture of what was
 * enabled, and then make decisions.
 */
void __init init_mem_debugging_and_hardening(void)
{
	bool page_poisoning_requested = false;

#ifdef CONFIG_PAGE_POISONING
	/*
	 * Page poisoning is debug page alloc for some arches. If
	 * either of those options are enabled, enable poisoning.
	 */
	if (page_poisoning_enabled() ||
	     (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
	      debug_pagealloc_enabled())) {
		static_branch_enable(&_page_poisoning_enabled);
		page_poisoning_requested = true;
	}
#endif

	/* Page poisoning wins over init_on_alloc/init_on_free. */
	if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
	    page_poisoning_requested) {
		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
			"will take precedence over init_on_alloc and init_on_free\n");
		_init_on_alloc_enabled_early = false;
		_init_on_free_enabled_early = false;
	}

	/* Commit the early boolean settings to their static keys. */
	if (_init_on_alloc_enabled_early)
		static_branch_enable(&init_on_alloc);
	else
		static_branch_disable(&init_on_alloc);

	if (_init_on_free_enabled_early)
		static_branch_enable(&init_on_free);
	else
		static_branch_disable(&init_on_free);

	if (IS_ENABLED(CONFIG_KMSAN) &&
	    (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
		pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");

#ifdef CONFIG_DEBUG_PAGEALLOC
	if (!debug_pagealloc_enabled())
		return;

	static_branch_enable(&_debug_pagealloc_enabled);

	/* Guard pages are only enabled for a non-zero minorder. */
	if (!debug_guardpage_minorder())
		return;

	static_branch_enable(&_debug_guardpage_enabled);
#endif
}

/* Store @order in the page's private field and set PageBuddy on it. */
static inline void set_buddy_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}

#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{
	struct capture_control *capc = current->capture_control;

	return unlikely(capc) &&
		!(current->flags & PF_KTHREAD) &&
		!capc->page &&
		capc->cc->zone == zone ?
capc : NULL; +} + +static inline bool +compaction_capture(struct capture_control *capc, struct page *page, + int order, int migratetype) +{ + if (!capc || order != capc->cc->order) + return false; + + /* Do not accidentally pollute CMA or isolated regions*/ + if (is_migrate_cma(migratetype) || + is_migrate_isolate(migratetype)) + return false; + + /* + * Do not let lower order allocations pollute a movable pageblock. + * This might let an unmovable request use a reclaimable pageblock + * and vice-versa but no more than normal fallback logic which can + * have trouble finding a high-order free page. + */ + if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) + return false; + + capc->page = page; + return true; +} + +#else +static inline struct capture_control *task_capc(struct zone *zone) +{ + return NULL; +} + +static inline bool +compaction_capture(struct capture_control *capc, struct page *page, + int order, int migratetype) +{ + return false; +} +#endif /* CONFIG_COMPACTION */ + +/* Used for pages not on another list */ +static inline void add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_add(&page->buddy_list, &area->free_list[migratetype]); + area->nr_free++; +} + +/* Used for pages not on another list */ +static inline void add_to_free_list_tail(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_add_tail(&page->buddy_list, &area->free_list[migratetype]); + area->nr_free++; +} + +/* + * Used for pages which are on another list. Move the pages to the tail + * of the list - so the moved pages won't immediately be considered for + * allocation again (e.g., optimization for memory onlining). 
+ */ +static inline void move_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_move_tail(&page->buddy_list, &area->free_list[migratetype]); +} +/* + * Take a free page off the buddy free list of the given order: clear its + * reported and PageBuddy state, zero page_private and decrement nr_free. + */ +static inline void del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order) +{ + /* clear reported state and update reported page count */ + if (page_reported(page)) + __ClearPageReported(page); + + list_del(&page->buddy_list); + __ClearPageBuddy(page); + set_page_private(page, 0); + zone->free_area[order].nr_free--; +} + +/* + * If this is not the largest possible page, check if the buddy + * of the next-highest order is free. If it is, it's possible + * that pages are being freed that will coalesce soon. In case + * that is happening, add the free page to the tail of the list + * so it's less likely to be used soon and more likely to be merged + * as a higher order page. + * + * Returns true if the buddy of the next-highest order was found, false + * otherwise or when order is at least MAX_ORDER - 2. + */ +static inline bool +buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, + struct page *page, unsigned int order) +{ + unsigned long higher_page_pfn; + struct page *higher_page; + + if (order >= MAX_ORDER - 2) + return false; + + higher_page_pfn = buddy_pfn & pfn; + higher_page = page + (higher_page_pfn - pfn); + + return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1, + NULL) != NULL; +} + +/* + * Freeing function for a buddy system allocator. + * + * The concept of a buddy system is to maintain direct-mapped table + * (containing bit values) for memory blocks of various "orders". + * The bottom level table contains the map for the smallest allocatable + * units of memory (here, pages), and each level above it describes + * pairs of units from the levels below, hence, "buddies".
+ * At a high level, all that happens here is marking the table entry + * at the bottom level available, and propagating the changes upward + * as necessary, plus some accounting needed to play nicely with other + * parts of the VM system. + * At each level, we keep a list of pages, which are heads of contiguous + * free pages of length of (1 << order) and marked with PageBuddy. + * Page's order is recorded in page_private(page) field. + * So when we are allocating or freeing one, we can derive the state of the + * other. That is, if we allocate a small block, and both were + * free, the remainder of the region must be split into blocks. + * If a block is freed, and its buddy is also free, then this + * triggers coalescing into a block of larger size. + * + * -- nyc + * + * The callers in this part of the file (free_pcppages_bulk(), free_one_page(), + * __free_pages_ok() and split_free_page()) all hold zone->lock around the call. + */ + +static inline void __free_one_page(struct page *page, + unsigned long pfn, + struct zone *zone, unsigned int order, + int migratetype, fpi_t fpi_flags) +{ + struct capture_control *capc = task_capc(zone); + unsigned long buddy_pfn = 0; + unsigned long combined_pfn; + struct page *buddy; + bool to_tail; + + VM_BUG_ON(!zone_is_initialized(zone)); + VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); + + VM_BUG_ON(migratetype == -1); + if (likely(!is_migrate_isolate(migratetype))) + __mod_zone_freepage_state(zone, 1 << order, migratetype); + + VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(bad_range(zone, page), page); + + while (order < MAX_ORDER - 1) { + if (compaction_capture(capc, page, order, migratetype)) { + __mod_zone_freepage_state(zone, -(1 << order), + migratetype); + return; + } + + buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn); + if (!buddy) + goto done_merging; + + if (unlikely(order >= pageblock_order)) { + /* + * We want to prevent merge between freepages on pageblock + * without fallbacks and normal pageblock. Without this, + * pageblock isolation could cause incorrect freepage or CMA + * accounting or HIGHATOMIC accounting.
+ */ + int buddy_mt = get_pageblock_migratetype(buddy); + + if (migratetype != buddy_mt + && (!migratetype_is_mergeable(migratetype) || + !migratetype_is_mergeable(buddy_mt))) + goto done_merging; + } + + /* + * Our buddy is free or it is a CONFIG_DEBUG_PAGEALLOC guard page, + * merge with it and move up one order. + */ + if (page_is_guard(buddy)) + clear_page_guard(zone, buddy, order, migratetype); + else + del_page_from_free_list(buddy, zone, order); + combined_pfn = buddy_pfn & pfn; + page = page + (combined_pfn - pfn); + pfn = combined_pfn; + order++; + } + +done_merging: + set_buddy_order(page, order); + + if (fpi_flags & FPI_TO_TAIL) + to_tail = true; + else if (is_shuffle_order(order)) + to_tail = shuffle_pick_tail(); + else + to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); + + if (to_tail) + add_to_free_list_tail(page, zone, order, migratetype); + else + add_to_free_list(page, zone, order, migratetype); + + /* Notify page reporting subsystem of freed page */ + if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) + page_reporting_notify_free(order); +} + +/** + * split_free_page() -- split a free page at split_pfn_offset + * @free_page: the original free page + * @order: the order of the page + * @split_pfn_offset: split offset within the page + * + * Return: -ENOENT if @free_page is no longer a free buddy page of @order once + * zone->lock is held, otherwise 0 (including when @split_pfn_offset is 0). + * + * It is used when the free page crosses two pageblocks with different migratetypes + * at split_pfn_offset within the page. The split free page will be put into + * separate migratetype lists afterwards. Otherwise, the function achieves + * nothing.
+ */ +int split_free_page(struct page *free_page, + unsigned int order, unsigned long split_pfn_offset) +{ + struct zone *zone = page_zone(free_page); + unsigned long free_page_pfn = page_to_pfn(free_page); + unsigned long pfn; + unsigned long flags; + int free_page_order; + int mt; + int ret = 0; + + if (split_pfn_offset == 0) + return ret; + + spin_lock_irqsave(&zone->lock, flags); + + if (!PageBuddy(free_page) || buddy_order(free_page) != order) { + ret = -ENOENT; + goto out; + } + + mt = get_pageblock_migratetype(free_page); + if (likely(!is_migrate_isolate(mt))) + __mod_zone_freepage_state(zone, -(1UL << order), mt); + + del_page_from_free_list(free_page, zone, order); + for (pfn = free_page_pfn; + pfn < free_page_pfn + (1UL << order);) { + int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); + + free_page_order = min_t(unsigned int, + pfn ? __ffs(pfn) : order, + __fls(split_pfn_offset)); + __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, + mt, FPI_NONE); + pfn += 1UL << free_page_order; + split_pfn_offset -= (1UL << free_page_order); + /* The first part is done; from here on split the remainder of the page. */ + if (split_pfn_offset == 0) + split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); + } +out: + spin_unlock_irqrestore(&zone->lock, flags); + return ret; +} +/* + * A bad page could be due to a number of fields. Instead of multiple branches, + * try and check multiple fields with one check. The caller must do a detailed + * check if necessary.
+ */ +static inline bool page_expected_state(struct page *page, + unsigned long check_flags) +{ + if (unlikely(atomic_read(&page->_mapcount) != -1)) + return false; + + if (unlikely((unsigned long)page->mapping | + page_ref_count(page) | +#ifdef CONFIG_MEMCG + page->memcg_data | +#endif + (page->flags & check_flags))) + return false; + + return true; +} +/* + * Return a description of why page is in a bad state, or NULL if none of the + * checks fires. When several checks fire, the last one in the order below + * determines the reported reason. + */ +static const char *page_bad_reason(struct page *page, unsigned long flags) +{ + const char *bad_reason = NULL; + + if (unlikely(atomic_read(&page->_mapcount) != -1)) + bad_reason = "nonzero mapcount"; + if (unlikely(page->mapping != NULL)) + bad_reason = "non-NULL mapping"; + if (unlikely(page_ref_count(page) != 0)) + bad_reason = "nonzero _refcount"; + if (unlikely(page->flags & flags)) { + if (flags == PAGE_FLAGS_CHECK_AT_PREP) + bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; + else + bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; + } +#ifdef CONFIG_MEMCG + if (unlikely(page->memcg_data)) + bad_reason = "page still charged to cgroup"; +#endif + return bad_reason; +} +/* Report page via bad_page() with the reason found for the free-time flags. */ +static void free_page_is_bad_report(struct page *page) +{ + bad_page(page, + page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); +} +/* + * Return true, after reporting the page, if it is not in the state expected + * of a page that is being freed; return false otherwise. + */ +static inline bool free_page_is_bad(struct page *page) +{ + if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) + return false; + + /* Something has gone sideways, find it */ + free_page_is_bad_report(page); + return true; +} +/* + * Validate one tail page of a compound page that is being freed, then clear + * its ->mapping and its compound head link. Returns 0 if the page passed + * (always the case without CONFIG_DEBUG_VM) and 1 if a problem was reported. + */ +static int free_tail_pages_check(struct page *head_page, struct page *page) +{ + int ret = 1; + + /* + * We rely on page->lru.next never having bit 0 set, unless the page + * is PageTail(). Let's make sure that's true even for poisoned ->lru.
+ */ + BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) { + ret = 0; + goto out; + } + switch (page - head_page) { + case 1: + /* the first tail page: ->mapping may be compound_mapcount() */ + if (unlikely(compound_mapcount(page))) { + bad_page(page, "nonzero compound_mapcount"); + goto out; + } + break; + case 2: + /* + * the second tail page: ->mapping is + * deferred_list.next -- ignore value. + */ + break; + default: + if (page->mapping != TAIL_MAPPING) { + bad_page(page, "corrupted mapping in tail page"); + goto out; + } + break; + } + if (unlikely(!PageTail(page))) { + bad_page(page, "PageTail not set"); + goto out; + } + if (unlikely(compound_head(page) != head_page)) { + bad_page(page, "compound_head not consistent"); + goto out; + } + ret = 0; +out: + page->mapping = NULL; + clear_compound_head(page); + return ret; +} + +/* + * Skip KASAN memory poisoning when either: + * + * 1. Deferred memory initialization has not yet completed, + * see the explanation below. + * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON, + * see the comment next to it. + * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, + * see the comment next to it. + * + * Poisoning pages during deferred memory init will greatly lengthen the + * process and cause problem in large memory systems as the deferred pages + * initialization is done with interrupt disabled. + * + * Assuming that there will be no reference to those newly initialized + * pages before they are ever allocated, this should have no effect on + * KASAN memory tracking as the poison will be properly inserted at page + * allocation time. The only corner case is when pages are allocated by + * on-demand allocation and then freed again before the deferred pages + * initialization is done, but this is not likely to happen.
+ */ +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +{ + return deferred_pages_enabled() || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); +} +/* + * Clear numpages pages starting at page with clear_highpage_kasan_tagged(), + * bracketed by kasan_disable_current() and kasan_enable_current(). + */ +static void kernel_init_pages(struct page *page, int numpages) +{ + int i; + + /* s390's use of memset() could override KASAN redzones. */ + kasan_disable_current(); + for (i = 0; i < numpages; i++) + clear_highpage_kasan_tagged(page + i); + kasan_enable_current(); +} +/* + * Run the checks and bookkeeping needed before a page of the given order is + * handed back to the allocator. check_free additionally validates the head + * page with free_page_is_bad(). Returns false if the page must not be freed + * (an order-0 hwpoison page, or a page that failed a check) and true + * otherwise. + */ +static __always_inline bool free_pages_prepare(struct page *page, + unsigned int order, bool check_free, fpi_t fpi_flags) +{ + int bad = 0; + bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); + bool init = want_init_on_free(); + + VM_BUG_ON_PAGE(PageTail(page), page); + + trace_mm_page_free(page, order); + kmsan_free_page(page, order); + + if (unlikely(PageHWPoison(page)) && !order) { + /* + * Do not let hwpoison pages hit pcplists/buddy. + * Untie memcg state and reset page's owner. + */ + if (memcg_kmem_enabled() && PageMemcgKmem(page)) + __memcg_kmem_uncharge_page(page, order); + reset_page_owner(page, order); + page_table_check_free(page, order); + return false; + } + + /* + * Check tail pages before head page information is cleared to + * avoid checking PageCompound for order-0 pages.
+ */ + if (unlikely(order)) { + bool compound = PageCompound(page); + int i; + + VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); + + if (compound) { + ClearPageDoubleMap(page); + ClearPageHasHWPoisoned(page); + } + for (i = 1; i < (1 << order); i++) { + if (compound) + bad += free_tail_pages_check(page, page + i); + if (unlikely(free_page_is_bad(page + i))) { + bad++; + continue; + } + (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + } + } + if (PageMappingFlags(page)) + page->mapping = NULL; + if (memcg_kmem_enabled() && PageMemcgKmem(page)) + __memcg_kmem_uncharge_page(page, order); + if (check_free && free_page_is_bad(page)) + bad++; + if (bad) + return false; + + page_cpupid_reset_last(page); + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + reset_page_owner(page, order); + page_table_check_free(page, order); + + if (!PageHighMem(page)) { + debug_check_no_locks_freed(page_address(page), + PAGE_SIZE << order); + debug_check_no_obj_freed(page_address(page), + PAGE_SIZE << order); + } + + kernel_poison_pages(page, 1 << order); + + /* + * As memory initialization might be integrated into KASAN, + * KASAN poisoning and memory initialization code must be + * kept together to avoid discrepancies in behavior. + * + * With hardware tag-based KASAN, memory tags must be set before the + * page becomes unavailable via debug_pagealloc or arch_free_page. + */ + if (!skip_kasan_poison) { + kasan_poison_pages(page, order, init); + + /* Memory is already initialized if KASAN did it internally. */ + if (kasan_has_integrated_init()) + init = false; + } + if (init) + kernel_init_pages(page, 1 << order); + + /* + * arch_free_page() can make the page's contents inaccessible. s390 + * does this. So nothing which can access the page's contents should + * happen after this.
+ */ + arch_free_page(page, order); + + debug_pagealloc_unmap_pages(page, 1 << order); + + return true; +} + +#ifdef CONFIG_DEBUG_VM +/* + * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed + * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when + * moved from pcp lists to free lists. + */ +static bool free_pcp_prepare(struct page *page, unsigned int order) +{ + return free_pages_prepare(page, order, true, FPI_NONE); +} + +/* Return true if this page has an inappropriate state. */ +static bool bulkfree_pcp_prepare(struct page *page) +{ + if (debug_pagealloc_enabled_static()) + return free_page_is_bad(page); + else + return false; +} +#else +/* + * With DEBUG_VM disabled, order-0 pages being freed are checked only when + * moving from pcp lists to free list in order to reduce overhead. With + * debug_pagealloc enabled, they are checked also immediately when being freed + * to the pcp lists. + */ +static bool free_pcp_prepare(struct page *page, unsigned int order) +{ + if (debug_pagealloc_enabled_static()) + return free_pages_prepare(page, order, true, FPI_NONE); + else + return free_pages_prepare(page, order, false, FPI_NONE); +} +/* Return true if this page has an inappropriate state. */ +static bool bulkfree_pcp_prepare(struct page *page) +{ + return free_page_is_bad(page); +} +#endif /* CONFIG_DEBUG_VM */ + +/* + * Frees a number of pages from the PCP lists. + * Assumes all pages on list are in same zone. + * count is the number of pages to free; pindex selects the list that is + * drained first, after which the lists are visited round-robin. + * Takes zone->lock for the duration of the drain. + */ +static void free_pcppages_bulk(struct zone *zone, int count, + struct per_cpu_pages *pcp, + int pindex) +{ + unsigned long flags; + int min_pindex = 0; + int max_pindex = NR_PCP_LISTS - 1; + unsigned int order; + bool isolated_pageblocks; + struct page *page; + + /* + * Ensure a proper count is passed; otherwise we would get stuck in the + * list selection loop below. + */ + count = min(pcp->count, count); + + /* Ensure requested pindex is drained first.
*/ + pindex = pindex - 1; + + spin_lock_irqsave(&zone->lock, flags); + isolated_pageblocks = has_isolate_pageblock(zone); + + while (count > 0) { + struct list_head *list; + int nr_pages; + + /* Remove pages from lists in a round-robin fashion. */ + do { + if (++pindex > max_pindex) + pindex = min_pindex; + list = &pcp->lists[pindex]; + if (!list_empty(list)) + break; + + if (pindex == max_pindex) + max_pindex--; + if (pindex == min_pindex) + min_pindex++; + } while (1); + + order = pindex_to_order(pindex); + nr_pages = 1 << order; + do { + int mt; + + page = list_last_entry(list, struct page, pcp_list); + mt = get_pcppage_migratetype(page); + + /* must delete to avoid corrupting pcp list */ + list_del(&page->pcp_list); + count -= nr_pages; + pcp->count -= nr_pages; + + if (bulkfree_pcp_prepare(page)) + continue; + + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ + if (unlikely(isolated_pageblocks)) + mt = get_pageblock_migratetype(page); + + __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, order, mt); + } while (count > 0 && !list_empty(list)); + } + + spin_unlock_irqrestore(&zone->lock, flags); +} +/* + * Free one page of the given order to the buddy lists under zone->lock. The + * migratetype is re-read from the pageblock when the zone has isolated + * pageblocks or the passed migratetype is an isolate type. + */ +static void free_one_page(struct zone *zone, + struct page *page, unsigned long pfn, + unsigned int order, + int migratetype, fpi_t fpi_flags) +{ + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); + if (unlikely(has_isolate_pageblock(zone) || + is_migrate_isolate(migratetype))) { + migratetype = get_pfnblock_migratetype(page, pfn); + } + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + spin_unlock_irqrestore(&zone->lock, flags); +} +/* + * Initialise the struct page for pfn: zero it, set its zone and node links, + * initialise its refcount and reset its mapcount, cpupid and KASAN tag. + */ +static void __meminit __init_single_page(struct page *page, unsigned long pfn, + unsigned long zone, int nid) +{ + mm_zero_struct_page(page); + set_page_links(page, zone, nid, pfn); + init_page_count(page); + page_mapcount_reset(page); +
page_cpupid_reset_last(page); + page_kasan_tag_reset(page); + + INIT_LIST_HEAD(&page->lru); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. */ + if (!is_highmem_idx(zone)) + set_page_address(page, __va(pfn << PAGE_SHIFT)); +#endif +} + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void __meminit init_reserved_page(unsigned long pfn) +{ + pg_data_t *pgdat; + int nid, zid; + + if (!early_page_uninitialised(pfn)) + return; + + nid = early_pfn_to_nid(pfn); + pgdat = NODE_DATA(nid); + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + + if (zone_spans_pfn(zone, pfn)) + break; + } + __init_single_page(pfn_to_page(pfn), pfn, zid, nid); +} +#else +static inline void init_reserved_page(unsigned long pfn) +{ +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + +/* + * Initialised pages do not have PageReserved set. This function is + * called for each range allocated by the bootmem allocator and + * marks the pages PageReserved. The remaining valid pages are later + * sent to the buddy page allocator. + */ +void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) +{ + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_UP(end); + + for (; start_pfn < end_pfn; start_pfn++) { + if (pfn_valid(start_pfn)) { + struct page *page = pfn_to_page(start_pfn); + + init_reserved_page(start_pfn); + + /* Avoid false-positive PageTail() */ + INIT_LIST_HEAD(&page->lru); + + /* + * No need for atomic set_bit because the struct + * page is not visible yet so nobody should + * access it yet.
+ */ + __SetPageReserved(page); + } + } +} +/* + * Prepare and free a page of the given order straight to the buddy lists, + * without going through the per-cpu lists, and count it in PGFREE. + */ +static void __free_pages_ok(struct page *page, unsigned int order, + fpi_t fpi_flags) +{ + unsigned long flags; + int migratetype; + unsigned long pfn = page_to_pfn(page); + struct zone *zone = page_zone(page); + + if (!free_pages_prepare(page, order, true, fpi_flags)) + return; + + migratetype = get_pfnblock_migratetype(page, pfn); + + spin_lock_irqsave(&zone->lock, flags); + if (unlikely(has_isolate_pageblock(zone) || + is_migrate_isolate(migratetype))) { + migratetype = get_pfnblock_migratetype(page, pfn); + } + __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); + spin_unlock_irqrestore(&zone->lock, flags); + + __count_vm_events(PGFREE, 1 << order); +} +/* + * Release 1 << order pages to the buddy allocator: clear PageReserved and the + * refcount on each page, add them to the zone's managed_pages and free them + * to the tail of the free list. + */ +void __free_pages_core(struct page *page, unsigned int order) +{ + unsigned int nr_pages = 1 << order; + struct page *p = page; + unsigned int loop; + + /* + * When initializing the memmap, __init_single_page() sets the refcount + * of all pages to 1 ("allocated"/"not free"). We have to set the + * refcount of all involved pages to 0. + */ + prefetchw(p); + for (loop = 0; loop < (nr_pages - 1); loop++, p++) { + prefetchw(p + 1); + __ClearPageReserved(p); + set_page_count(p, 0); + } + __ClearPageReserved(p); + set_page_count(p, 0); + + atomic_long_add(nr_pages, &page_zone(page)->managed_pages); + + /* + * Bypass PCP and place fresh pages right to the tail, primarily + * relevant for memory onlining. + */ + __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); +} + +#ifdef CONFIG_NUMA + +/* + * During memory init memblocks map pfns to nids. The search is expensive and + * this caches recent lookups. The implementation of __early_pfn_to_nid + * treats start/end as pfns. + */ +struct mminit_pfnnid_cache { + unsigned long last_start; + unsigned long last_end; + int last_nid; +}; + +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; + +/* + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ */ +static int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) +{ + unsigned long start_pfn, end_pfn; + int nid; + + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; + + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); + if (nid != NUMA_NO_NODE) { + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; + } + + return nid; +} +/* + * Locked wrapper around __early_pfn_to_nid() that uses the shared + * early_pfnnid_cache. Falls back to first_online_node when the lookup + * returns a negative node id. + */ +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + static DEFINE_SPINLOCK(early_pfn_lock); + int nid; + + spin_lock(&early_pfn_lock); + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); + if (nid < 0) + nid = first_online_node; + spin_unlock(&early_pfn_lock); + + return nid; +} +#endif /* CONFIG_NUMA */ +/* + * Hand a block of 1 << order pages to __free_pages_core(), unless the pfn is + * still uninitialised or kmsan_memblock_free_pages() returns false. + */ +void __init memblock_free_pages(struct page *page, unsigned long pfn, + unsigned int order) +{ + if (early_page_uninitialised(pfn)) + return; + if (!kmsan_memblock_free_pages(page, order)) { + /* KMSAN will take care of these pages. */ + return; + } + __free_pages_core(page, order); +} + +/* + * Check that the whole (or subset of) a pageblock given by the interval of + * [start_pfn, end_pfn) is valid and within the same zone, before scanning it + * with the migration or free compaction scanner. + * + * Return struct page pointer of start_pfn, or NULL if checks were not passed. + * + * It's possible on some configurations to have a setup like node0 node1 node0 + * i.e. it's possible that all pages within a zone's range of pages do not + * belong to a single zone. We assume that a border between node0 and node1 + * can occur within a single pageblock, but not a node0 node1 node0 + * interleaving within a single pageblock. It is therefore sufficient to check + * the first and last page of a pageblock and avoid checking each individual + * page in a pageblock.
+ */ +struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone) +{ + struct page *start_page; + struct page *end_page; + + /* end_pfn is one past the range we are checking */ + end_pfn--; + + if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) + return NULL; + + start_page = pfn_to_online_page(start_pfn); + if (!start_page) + return NULL; + + if (page_zone(start_page) != zone) + return NULL; + + end_page = pfn_to_page(end_pfn); + + /* This gives shorter code than deriving page_zone(end_page) */ + if (page_zone_id(start_page) != page_zone_id(end_page)) + return NULL; + + return start_page; +} +/* + * Mark the zone contiguous if every pageblock-sized step across the zone + * passes __pageblock_pfn_to_page(); leave the flag untouched otherwise. + */ +void set_zone_contiguous(struct zone *zone) +{ + unsigned long block_start_pfn = zone->zone_start_pfn; + unsigned long block_end_pfn; + + block_end_pfn = pageblock_end_pfn(block_start_pfn); + for (; block_start_pfn < zone_end_pfn(zone); + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); + + if (!__pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, zone)) + return; + cond_resched(); + } + + /* We confirm that there is no hole */ + zone->contiguous = true; +} +/* Clear the zone's contiguous flag. */ +void clear_zone_contiguous(struct zone *zone) +{ + zone->contiguous = false; +} + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void __init deferred_free_range(unsigned long pfn, + unsigned long nr_pages) +{ + struct page *page; + unsigned long i; + + if (!nr_pages) + return; + + page = pfn_to_page(pfn); + + /* Free a large naturally-aligned chunk if possible */ + if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + __free_pages_core(page, pageblock_order); + return; + } + + for (i = 0; i < nr_pages; i++, page++, pfn++) { + if (pageblock_aligned(pfn)) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + __free_pages_core(page, 0); + } +} + +/* Completion tracking for deferred_init_memmap() threads */ +static
atomic_t pgdat_init_n_undone __initdata; +static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); +/* + * Decrement pgdat_init_n_undone; the call that brings it to zero completes + * pgdat_init_all_done_comp. + */ +static inline void __init pgdat_init_report_one_done(void) +{ + if (atomic_dec_and_test(&pgdat_init_n_undone)) + complete(&pgdat_init_all_done_comp); +} + +/* + * Returns true if page needs to be initialized or freed to buddy allocator. + * + * We check if a current large page is valid by only checking the validity + * of the head pfn: pfn_valid() is consulted for pageblock-aligned pfns only, + * every other pfn is reported as valid. + */ +static inline bool __init deferred_pfn_valid(unsigned long pfn) +{ + if (pageblock_aligned(pfn) && !pfn_valid(pfn)) + return false; + return true; +} + +/* + * Free pages to buddy allocator. Try to free aligned pages in + * pageblock_nr_pages sizes: runs of pfns are accumulated and flushed with + * deferred_free_range() at every pageblock boundary and invalid pfn. + */ +static void __init deferred_free_pages(unsigned long pfn, + unsigned long end_pfn) +{ + unsigned long nr_free = 0; + + for (; pfn < end_pfn; pfn++) { + if (!deferred_pfn_valid(pfn)) { + deferred_free_range(pfn - nr_free, nr_free); + nr_free = 0; + } else if (pageblock_aligned(pfn)) { + deferred_free_range(pfn - nr_free, nr_free); + nr_free = 1; + } else { + nr_free++; + } + } + /* Free the last block of pages to allocator */ + deferred_free_range(pfn - nr_free, nr_free); +} + +/* + * Initialize struct pages. We minimize pfn page lookups and scheduler checks + * by performing them only once every pageblock_nr_pages. + * Return number of pages initialized. + */ +static unsigned long __init deferred_init_pages(struct zone *zone, + unsigned long pfn, + unsigned long end_pfn) +{ + int nid = zone_to_nid(zone); + unsigned long nr_pages = 0; + int zid = zone_idx(zone); + struct page *page = NULL; + + for (; pfn < end_pfn; pfn++) { + if (!deferred_pfn_valid(pfn)) { + page = NULL; + continue; + } else if (!page || pageblock_aligned(pfn)) { + page = pfn_to_page(pfn); + } else { + page++; + } + __init_single_page(page, pfn, zid, nid); + nr_pages++; + } + return (nr_pages); +} + +/* + * This function is meant to pre-load the iterator for the zone init.
+ * Specifically it walks through the ranges until we are caught up to the
+ * first_init_pfn value and exits there. If we never encounter the value we
+ * return false indicating there are no valid ranges left.
+ */
+static bool __init
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
+				    unsigned long *spfn, unsigned long *epfn,
+				    unsigned long first_init_pfn)
+{
+	u64 cursor;
+
+	/*
+	 * Ranges ending at or below first_init_pfn have been initialized
+	 * already and need no work here, so step over them until a range
+	 * reaches past that pfn.
+	 */
+	for_each_free_mem_pfn_range_in_zone(cursor, zone, spfn, epfn) {
+		if (*epfn > first_init_pfn) {
+			/* Clip a range that straddles first_init_pfn. */
+			if (first_init_pfn > *spfn)
+				*spfn = first_init_pfn;
+
+			/* Hand the iterator position back to the caller. */
+			*i = cursor;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, then free to buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ *
+ * In order to try and keep some memory in the cache we have the loop
+ * broken along max page order boundaries. This way we will not cause
+ * any issues with the buddy page computation.
+ */
+static unsigned long __init
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
+		       unsigned long *end_pfn)
+{
+	unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+	unsigned long spfn = *start_pfn, epfn = *end_pfn;
+	unsigned long nr_pages = 0;
+	u64 j = *i;
+
+	/*
+	 * First we loop through and initialize the page values.
+	 *
+	 * mo_pfn is the first MAX_ORDER_NR_PAGES-aligned pfn above
+	 * *start_pfn and both loops stop there. This pass walks a copy of
+	 * the iterator (j) and writes the ranges through start_pfn/end_pfn;
+	 * when a range extends past mo_pfn, *start_pfn is left at mo_pfn so
+	 * that the caller resumes from that pfn.
+	 */
+	for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
+		unsigned long t;
+
+		if (mo_pfn <= *start_pfn)
+			break;
+
+		t = min(mo_pfn, *end_pfn);
+		nr_pages += deferred_init_pages(zone, *start_pfn, t);
+
+		if (mo_pfn < *end_pfn) {
+			*start_pfn = mo_pfn;
+			break;
+		}
+	}
+
+	/*
+	 * Reset values and now loop through freeing pages as needed.
+	 *
+	 * The swap stores the advanced iterator in *i for the caller and
+	 * puts the original position back into j, so the second pass covers
+	 * the same ranges again using the local spfn/epfn copies.
+	 */
+	swap(j, *i);
+
+	for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
+		unsigned long t;
+
+		if (mo_pfn <= spfn)
+			break;
+
+		t = min(mo_pfn, epfn);
+		deferred_free_pages(spfn, t);
+
+		if (mo_pfn <= epfn)
+			break;
+	}
+
+	return nr_pages;
+}
+/*
+ * Initialize and free the deferred pages of the zone passed in @arg,
+ * starting at the first free range that reaches past start_pfn and
+ * continuing until the range cursor is at or beyond end_pfn.
+ */
+static void __init
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+			   void *arg)
+{
+	unsigned long spfn, epfn;
+	struct zone *zone = arg;
+	u64 i;
+
+	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
+
+	/*
+	 * Initialize and free pages in MAX_ORDER sized increments so that we
+	 * can avoid introducing any issues with the buddy allocator.
+	 */
+	while (spfn < end_pfn) {
+		deferred_init_maxorder(&i, zone, &spfn, &epfn);
+		cond_resched();
+	}
+}
+
+/* An arch may override for more concurrency.
*/
+__weak int __init
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+	return 1;
+}
+
+/*
+ * Initialise remaining memory on a node.
+ *
+ * @data is the node's pg_data_t. The function always returns 0 and calls
+ * pgdat_init_report_one_done() exactly once on each of its return paths.
+ */
+static int __init deferred_init_memmap(void *data)
+{
+	pg_data_t *pgdat = data;
+	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+	unsigned long spfn = 0, epfn = 0;
+	unsigned long first_init_pfn, flags;
+	unsigned long start = jiffies;
+	struct zone *zone;
+	int zid, max_threads;
+	u64 i;
+
+	/* Bind memory initialisation thread to a local node if possible */
+	if (!cpumask_empty(cpumask))
+		set_cpus_allowed_ptr(current, cpumask);
+
+	pgdat_resize_lock(pgdat, &flags);
+	first_init_pfn = pgdat->first_deferred_pfn;
+	if (first_init_pfn == ULONG_MAX) {
+		pgdat_resize_unlock(pgdat, &flags);
+		pgdat_init_report_one_done();
+		return 0;
+	}
+
+	/* Sanity check boundaries */
+	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
+	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
+	pgdat->first_deferred_pfn = ULONG_MAX;
+
+	/*
+	 * Once we unlock here, the zone cannot be grown anymore, thus if an
+	 * interrupt thread must allocate this early in boot, zone must be
+	 * pre-grown prior to start of deferred page initialization.
+	 */
+	pgdat_resize_unlock(pgdat, &flags);
+
+	/* Only the highest zone is deferred so find it */
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		zone = pgdat->node_zones + zid;
+		if (first_init_pfn < zone_end_pfn(zone))
+			break;
+	}
+
+	/* If the zone is empty somebody else may have cleared out the zone */
+	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+						 first_init_pfn))
+		goto zone_empty;
+
+	max_threads = deferred_page_init_max_threads(cpumask);
+
+	while (spfn < epfn) {
+		unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
+		struct padata_mt_job job = {
+			.thread_fn   = deferred_init_memmap_chunk,
+			.fn_arg      = zone,
+			.start       = spfn,
+			.size        = epfn_align - spfn,
+			.align       = PAGES_PER_SECTION,
+			.min_chunk   = PAGES_PER_SECTION,
+			.max_threads = max_threads,
+		};
+
+		padata_do_multithreaded(&job);
+		deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+						    epfn_align);
+	}
+zone_empty:
+	/* Sanity check that the next zone really is unpopulated */
+	WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
+
+	pr_info("node %d deferred pages initialised in %ums\n",
+		pgdat->node_id, jiffies_to_msecs(jiffies - start));
+
+	pgdat_init_report_one_done();
+	return 0;
+}
+
+/*
+ * If this zone has deferred pages, try to grow it by initializing enough
+ * deferred pages to satisfy the allocation specified by order, rounded up to
+ * the nearest PAGES_PER_SECTION boundary.  So we're adding memory in increments
+ * of SECTION_SIZE bytes by initializing struct pages in increments of
+ * PAGES_PER_SECTION * sizeof(struct page) bytes.
+ *
+ * Return true when zone was grown, otherwise return false. We return true even
+ * when we grow less than requested, to let the caller decide if there are
+ * enough pages to satisfy the allocation.
+ *
+ * Note: We use noinline because this function is needed only during boot, and
+ * it is called from a __ref function _deferred_grow_zone.
This way we are
+ * making sure that it is not inlined into permanent text section.
+ */
+static noinline bool __init
+deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+	pg_data_t *pgdat = zone->zone_pgdat;
+	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
+	unsigned long spfn, epfn, flags;
+	unsigned long nr_pages = 0;
+	u64 i;
+
+	/* Only the last zone may have deferred pages */
+	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
+		return false;
+
+	pgdat_resize_lock(pgdat, &flags);
+
+	/*
+	 * If someone grew this zone while we were waiting for spinlock, return
+	 * true, as there might be enough pages already. The comparison is
+	 * against the value sampled before the lock was taken.
+	 */
+	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
+		pgdat_resize_unlock(pgdat, &flags);
+		return true;
+	}
+
+	/* If the zone is empty somebody else may have cleared out the zone */
+	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+						 first_deferred_pfn)) {
+		pgdat->first_deferred_pfn = ULONG_MAX;
+		pgdat_resize_unlock(pgdat, &flags);
+		/* Retry only once. */
+		return first_deferred_pfn != ULONG_MAX;
+	}
+
+	/*
+	 * Initialize and free pages in MAX_ORDER sized increments so
+	 * that we can avoid introducing any issues with the buddy
+	 * allocator.
+	 */
+	while (spfn < epfn) {
+		/* update our first deferred PFN for this section */
+		first_deferred_pfn = spfn;
+
+		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+		touch_nmi_watchdog();
+
+		/*
+		 * We should only stop along section boundaries. The XOR
+		 * stays below PAGES_PER_SECTION while the pfn before and
+		 * after this step agree in every bit above the in-section
+		 * offset (this presumes PAGES_PER_SECTION is a power of
+		 * two), i.e. while no section boundary has been crossed.
+		 */
+		if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
+			continue;
+
+		/* If our quota has been met we can stop here */
+		if (nr_pages >= nr_pages_needed)
+			break;
+	}
+	/* Record how far initialization got before dropping the lock. */
+	pgdat->first_deferred_pfn = spfn;
+	pgdat_resize_unlock(pgdat, &flags);
+
+	return nr_pages > 0;
+}
+
+/*
+ * deferred_grow_zone() is __init, but it is called from
+ * get_page_from_freelist() during early boot until deferred_pages permanently
+ * disables this call.
This is why we have refdata wrapper to avoid warning,
+ * and to ensure that the function body gets unloaded.
+ */
+static bool __ref
+_deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+	return deferred_grow_zone(zone, order);
+}
+
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+/*
+ * Late page allocator setup. With CONFIG_DEFERRED_STRUCT_PAGE_INIT this
+ * starts one deferred_init_memmap() kthread per N_MEMORY node, waits for
+ * all of them, disables the deferred_pages static branch and calls
+ * files_maxfiles_init(). In every configuration it then calls
+ * buffer_init() and memblock_discard(), shuffles the free memory of each
+ * N_MEMORY node and runs set_zone_contiguous() on every populated zone.
+ */
+void __init page_alloc_init_late(void)
+{
+	struct zone *zone;
+	int nid;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+
+	/*
+	 * There will be num_node_state(N_MEMORY) threads.
+	 *
+	 * NOTE(review): the kthread_run() result is not checked. A thread
+	 * that failed to start would never decrement pgdat_init_n_undone,
+	 * so the wait below would not return - confirm that a failure
+	 * cannot happen at this point of boot.
+	 */
+	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
+	for_each_node_state(nid, N_MEMORY) {
+		kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
+	}
+
+	/* Block until all are initialised */
+	wait_for_completion(&pgdat_init_all_done_comp);
+
+	/*
+	 * We initialized the rest of the deferred pages.  Permanently disable
+	 * on-demand struct page initialization.
+	 */
+	static_branch_disable(&deferred_pages);
+
+	/* Reinit limits that are based on free pages after the kernel is up */
+	files_maxfiles_init();
+#endif
+
+	buffer_init();
+
+	/* Discard memblock private memory */
+	memblock_discard();
+
+	for_each_node_state(nid, N_MEMORY)
+		shuffle_free_memory(NODE_DATA(nid));
+
+	for_each_populated_zone(zone)
+		set_zone_contiguous(zone);
+}
+
+#ifdef CONFIG_CMA
+/*
+ * Free whole pageblock and set its migration type to MIGRATE_CMA.
+ *
+ * Every page of the block first has its reserved flag cleared and its
+ * count set to 0; the block is then freed as one pageblock_order page and
+ * added to the managed page count and to the zone's cma_pages.
+ */
+void __init init_cma_reserved_pageblock(struct page *page)
+{
+	unsigned i = pageblock_nr_pages;
+	struct page *p = page;
+
+	do {
+		__ClearPageReserved(p);
+		set_page_count(p, 0);
+	} while (++p, --i);
+
+	set_pageblock_migratetype(page, MIGRATE_CMA);
+	set_page_refcounted(page);
+	__free_pages(page, pageblock_order);
+
+	adjust_managed_page_count(page, pageblock_nr_pages);
+	page_zone(page)->cma_pages += pageblock_nr_pages;
+}
+#endif
+
+/*
+ * The order of subdivision here is critical for the IO subsystem.
+ * Please do not alter this order without good reasons and regression
+ * testing.
Specifically, as large blocks of memory are subdivided,
+ * the order in which smaller blocks are delivered depends on the order
+ * they're subdivided in this function. This is the primary factor
+ * influencing the order in which pages are delivered to the IO
+ * subsystem according to empirical testing, and this is also justified
+ * by considering the behavior of a buddy system containing a single
+ * large block of memory acted on by a series of small allocations.
+ * This behavior is a critical factor in sglist merging's success.
+ *
+ * -- nyc
+ */
+static inline void expand(struct zone *zone, struct page *page,
+	int low, int high, int migratetype)
+{
+	unsigned long size = 1 << high;
+
+	while (high > low) {
+		high--;
+		size >>= 1;
+		VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
+
+		/*
+		 * Mark as guard pages (or page), that will allow to
+		 * merge back to allocator when buddy will be freed.
+		 * Corresponding page table entries will not be touched,
+		 * pages will stay not present in virtual address space
+		 */
+		if (set_page_guard(zone, &page[size], high, migratetype))
+			continue;
+
+		add_to_free_list(&page[size], zone, high, migratetype);
+		set_buddy_order(&page[size], high);
+	}
+}
+/*
+ * Handle a page that failed the check in check_new_page(). A page with
+ * __PG_HWPOISON set is not reported and only has its mapcount reset;
+ * any other page is passed to bad_page() with the reason computed by
+ * page_bad_reason().
+ */
+static void check_new_page_bad(struct page *page)
+{
+	if (unlikely(page->flags & __PG_HWPOISON)) {
+		/* Don't complain about hwpoisoned pages */
+		page_mapcount_reset(page); /* remove PageBuddy */
+		return;
+	}
+
+	bad_page(page,
+		 page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
+}
+
+/*
+ * This page is about to be returned from the page allocator.
+ * Returns 0 when the page is in the expected state, 1 otherwise.
+ */
+static inline int check_new_page(struct page *page)
+{
+	if (likely(page_expected_state(page,
+				PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
+		return 0;
+
+	check_new_page_bad(page);
+	return 1;
+}
+/*
+ * Run check_new_page() on each of the 1 << order pages starting at page.
+ * Returns true as soon as one page fails, false when all pass.
+ */
+static bool check_new_pages(struct page *page, unsigned int order)
+{
+	int i;
+	for (i = 0; i < (1 << order); i++) {
+		struct page *p = page + i;
+
+		if (unlikely(check_new_page(p)))
+			return true;
+	}
+
+	return false;
+}
+
+#ifdef CONFIG_DEBUG_VM
+/*
+ * With DEBUG_VM enabled, order-0 pages are checked for expected state when
+ * being allocated from pcp lists. With debug_pagealloc also enabled, they are
+ * also checked when pcp lists are refilled from the free lists.
+ */
+static inline bool check_pcp_refill(struct page *page, unsigned int order)
+{
+	if (debug_pagealloc_enabled_static())
+		return check_new_pages(page, order);
+	else
+		return false;
+}
+
+static inline bool check_new_pcp(struct page *page, unsigned int order)
+{
+	return check_new_pages(page, order);
+}
+#else
+/*
+ * With DEBUG_VM disabled, free order-0 pages are checked for expected state
+ * when pcp lists are being refilled from the free lists. With debug_pagealloc
+ * enabled, they are also checked when being allocated from the pcp lists.
+ */
+static inline bool check_pcp_refill(struct page *page, unsigned int order)
+{
+	return check_new_pages(page, order);
+}
+static inline bool check_new_pcp(struct page *page, unsigned int order)
+{
+	if (debug_pagealloc_enabled_static())
+		return check_new_pages(page, order);
+	else
+		return false;
+}
+#endif /* CONFIG_DEBUG_VM */
+/*
+ * Tell post_alloc_hook() whether it may skip kasan_unpoison_pages() for
+ * an allocation made with the given gfp flags.
+ */
+static inline bool should_skip_kasan_unpoison(gfp_t flags)
+{
+	/* Don't skip if a software KASAN mode is enabled. */
+	if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
+	    IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+		return false;
+
+	/* Skip, if hardware tag-based KASAN is not enabled. */
+	if (!kasan_hw_tags_enabled())
+		return true;
+
+	/*
+	 * With hardware tag-based KASAN enabled, skip if this has been
+	 * requested via __GFP_SKIP_KASAN_UNPOISON.
+	 */
+	return flags & __GFP_SKIP_KASAN_UNPOISON;
+}
+/*
+ * Tell post_alloc_hook() whether memory initialization is to be skipped
+ * because the allocation carries __GFP_SKIP_ZERO.
+ */
+static inline bool should_skip_init(gfp_t flags)
+{
+	/* Don't skip, if hardware tag-based KASAN is not enabled. */
+	if (!kasan_hw_tags_enabled())
+		return false;
+
+	/* For hardware tag-based KASAN, skip if requested. */
+	return (flags & __GFP_SKIP_ZERO);
+}
+/*
+ * Prepare a block of 1 << order pages that is being handed out by the
+ * allocator: reset page private and refcount, run the arch and
+ * debug_pagealloc hooks, unpoison, initialize memory and/or tags as the
+ * gfp flags require, then record the page owner and page table check
+ * state.
+ */
+inline void post_alloc_hook(struct page *page, unsigned int order,
+				gfp_t gfp_flags)
+{
+	bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
+			!should_skip_init(gfp_flags);
+	bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+	int i;
+
+	set_page_private(page, 0);
+	set_page_refcounted(page);
+
+	arch_alloc_page(page, order);
+	debug_pagealloc_map_pages(page, 1 << order);
+
+	/*
+	 * Page unpoisoning must happen before memory initialization.
+	 * Otherwise, the poison pattern will be overwritten for __GFP_ZERO
+	 * allocations and the page unpoisoning code will complain.
+	 */
+	kernel_unpoison_pages(page, 1 << order);
+
+	/*
+	 * As memory initialization might be integrated into KASAN,
+	 * KASAN unpoisoning and memory initialization code must be
+	 * kept together to avoid discrepancies in behavior.
+	 */
+
+	/*
+	 * If memory tags should be zeroed (which happens only when memory
+	 * should be initialized as well).
+	 */
+	if (init_tags) {
+		/* Initialize both memory and tags. */
+		for (i = 0; i != 1 << order; ++i)
+			tag_clear_highpage(page + i);
+
+		/* Note that memory is already initialized by the loop above. */
+		init = false;
+	}
+	if (!should_skip_kasan_unpoison(gfp_flags)) {
+		/* Unpoison shadow memory or set memory tags. */
+		kasan_unpoison_pages(page, order, init);
+
+		/* Note that memory is already initialized by KASAN. */
+		if (kasan_has_integrated_init())
+			init = false;
+	} else {
+		/* Ensure page_address() dereferencing does not fault. */
+		for (i = 0; i != 1 << order; ++i)
+			page_kasan_tag_reset(page + i);
+	}
+	/* If memory is still not initialized, do it now. */
+	if (init)
+		kernel_init_pages(page, 1 << order);
+	/* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
+	if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
+		SetPageSkipKASanPoison(page);
+
+	set_page_owner(page, order, gfp_flags);
+	page_table_check_alloc(page, order);
+}
+/*
+ * Run post_alloc_hook() on the page, turn it into a compound page when
+ * order is non-zero and __GFP_COMP is set, and set or clear its
+ * pfmemalloc marker according to ALLOC_NO_WATERMARKS.
+ */
+static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+							unsigned int alloc_flags)
+{
+	post_alloc_hook(page, order, gfp_flags);
+
+	if (order && (gfp_flags & __GFP_COMP))
+		prep_compound_page(page, order);
+
+	/*
+	 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
+	 * allocate the page. The expectation is that the caller is taking
+	 * steps that will free more memory. The caller should avoid the page
+	 * being used for !PFMEMALLOC purposes.
+	 */
+	if (alloc_flags & ALLOC_NO_WATERMARKS)
+		set_page_pfmemalloc(page);
+	else
+		clear_page_pfmemalloc(page);
+}
+
+/*
+ * Go through the free lists for the given migratetype and remove
+ * the smallest available page from the freelists.
+ *
+ * Orders from the requested one up to MAX_ORDER - 1 are tried in turn. A
+ * larger page is split by expand(). Returns NULL when no order has a free
+ * page of this migratetype.
+ */
+static __always_inline
+struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
+						int migratetype)
+{
+	unsigned int current_order;
+	struct free_area *area;
+	struct page *page;
+
+	/* Find a page of the appropriate size in the preferred list */
+	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+		area = &(zone->free_area[current_order]);
+		page = get_page_from_free_area(area, migratetype);
+		if (!page)
+			continue;
+		del_page_from_free_list(page, zone, current_order);
+		expand(zone, page, order, current_order, migratetype);
+		set_pcppage_migratetype(page, migratetype);
+		trace_mm_page_alloc_zone_locked(page, order, migratetype,
+				pcp_allowed_order(order) &&
+				migratetype < MIGRATE_PCPTYPES);
+		return page;
+	}
+
+	return NULL;
+}
+
+
+/*
+ * This array describes the order lists are fallen back to when
+ * the free lists for the desirable migrate type are depleted
+ *
+ * The other migratetypes do not have fallbacks.
+ */
+static int fallbacks[MIGRATE_TYPES][3] = {
+	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
+	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
+	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
+};
+
+#ifdef CONFIG_CMA
+static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
+					unsigned int order)
+{
+	return __rmqueue_smallest(zone, order, MIGRATE_CMA);
+}
+#else
+static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
+					unsigned int order) { return NULL; }
+#endif
+
+/*
+ * Move the free pages in a range to the freelist tail of the requested type.
+ * Note that start_pfn and end_pfn are not required to be aligned on a
+ * pageblock boundary. If alignment is required, use move_freepages_block().
+ *
+ * end_pfn is inclusive. Returns the number of pages moved. When num_movable
+ * is non-NULL it is incremented once for every page in the range that is
+ * not a buddy page but is on the LRU or is __PageMovable().
+ */
+static int move_freepages(struct zone *zone,
+			  unsigned long start_pfn, unsigned long end_pfn,
+			  int migratetype, int *num_movable)
+{
+	struct page *page;
+	unsigned long pfn;
+	unsigned int order;
+	int pages_moved = 0;
+
+	for (pfn = start_pfn; pfn <= end_pfn;) {
+		page = pfn_to_page(pfn);
+		if (!PageBuddy(page)) {
+			/*
+			 * We assume that pages that could be isolated for
+			 * migration are movable. But we don't actually try
+			 * isolating, as that would be expensive.
+			 */
+			if (num_movable &&
+					(PageLRU(page) || __PageMovable(page)))
+				(*num_movable)++;
+			pfn++;
+			continue;
+		}
+
+		/* Make sure we are not inadvertently changing nodes */
+		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+		VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+
+		order = buddy_order(page);
+		move_to_free_list(page, zone, order, migratetype);
+		pfn += 1 << order;
+		pages_moved += 1 << order;
+	}
+
+	return pages_moved;
+}
+/*
+ * Apply move_freepages() to the pageblock that contains page and return
+ * its result. *num_movable is zeroed first when the pointer is non-NULL.
+ * If the last pfn of the block is outside the zone nothing is moved and 0
+ * is returned; if only the first pfn is outside, the scan starts at the
+ * pfn of page instead of the start of the block.
+ */
+int move_freepages_block(struct zone *zone, struct page *page,
+				int migratetype, int *num_movable)
+{
+	unsigned long start_pfn, end_pfn, pfn;
+
+	if (num_movable)
+		*num_movable = 0;
+
+	pfn = page_to_pfn(page);
+	start_pfn = pageblock_start_pfn(pfn);
+	end_pfn = pageblock_end_pfn(pfn) - 1;
+
+	/* Do not cross zone boundaries */
+	if (!zone_spans_pfn(zone, start_pfn))
+		start_pfn = pfn;
+	if (!zone_spans_pfn(zone, end_pfn))
+		return 0;
+
+	return move_freepages(zone, start_pfn, end_pfn, migratetype,
+								num_movable);
+}
+/*
+ * Set the migratetype of each of the 1 << (start_order - pageblock_order)
+ * consecutive pageblocks that begin at pageblock_page.
+ */
+static void change_pageblock_range(struct page *pageblock_page,
+					int start_order, int migratetype)
+{
+	int nr_pageblocks = 1 << (start_order - pageblock_order);
+
+	while (nr_pageblocks--) {
+		set_pageblock_migratetype(pageblock_page, migratetype);
+		pageblock_page += pageblock_nr_pages;
+	}
+}
+
+/*
+ * When we are falling back to another migratetype during allocation, try to
+ * steal extra free pages from the same pageblocks to satisfy further
+ * allocations, instead of polluting multiple pageblocks.
+ *
+ * If we are stealing a relatively large buddy page, it is likely there will
+ * be more free pages in the pageblock, so try to steal them all. For
+ * reclaimable and unmovable allocations, we steal regardless of page size,
+ * as fragmentation caused by those allocations polluting movable pageblocks
+ * is worse than movable allocations stealing from unmovable and reclaimable
+ * pageblocks.
+ */ +static bool can_steal_fallback(unsigned int order, int start_mt) +{ + /* + * Leaving this order check is intended, although there is + * relaxed order check in next check. The reason is that + * we can actually steal whole pageblock if this condition met, + * but, below check doesn't guarantee it and that is just heuristic + * so could be changed anytime. + */ + if (order >= pageblock_order) + return true; + + if (order >= pageblock_order / 2 || + start_mt == MIGRATE_RECLAIMABLE || + start_mt == MIGRATE_UNMOVABLE || + page_group_by_mobility_disabled) + return true; + + return false; +} + +static inline bool boost_watermark(struct zone *zone) +{ + unsigned long max_boost; + + if (!watermark_boost_factor) + return false; + /* + * Don't bother in zones that are unlikely to produce results. + * On small machines, including kdump capture kernels running + * in a small area, boosting the watermark can cause an out of + * memory situation immediately. + */ + if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) + return false; + + max_boost = mult_frac(zone->_watermark[WMARK_HIGH], + watermark_boost_factor, 10000); + + /* + * high watermark may be uninitialised if fragmentation occurs + * very early in boot so do not boost. We do not fall + * through and boost by pageblock_nr_pages as failing + * allocations that early means that reclaim is not going + * to help and it may even be impossible to reclaim the + * boosted watermark resulting in a hang. + */ + if (!max_boost) + return false; + + max_boost = max(pageblock_nr_pages, max_boost); + + zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, + max_boost); + + return true; +} + +/* + * This function implements actual steal behaviour. If order is large enough, + * we can steal whole pageblock. If not, we first move freepages in this + * pageblock to our migratetype and determine how many already-allocated pages + * are there in the pageblock with a compatible migratetype. 
If at least half
+ * of pages are free or compatible, we can change migratetype of the pageblock
+ * itself, so pages freed in the future will be put on the correct free list.
+ */
+static void steal_suitable_fallback(struct zone *zone, struct page *page,
+		unsigned int alloc_flags, int start_type, bool whole_block)
+{
+	unsigned int current_order = buddy_order(page);
+	int free_pages, movable_pages, alike_pages;
+	int old_block_type;
+
+	old_block_type = get_pageblock_migratetype(page);
+
+	/*
+	 * This can happen due to races and we want to prevent broken
+	 * highatomic accounting.
+	 */
+	if (is_migrate_highatomic(old_block_type))
+		goto single_page;
+
+	/* Take ownership for orders >= pageblock_order */
+	if (current_order >= pageblock_order) {
+		change_pageblock_range(page, current_order, start_type);
+		goto single_page;
+	}
+
+	/*
+	 * Boost watermarks to increase reclaim pressure to reduce the
+	 * likelihood of future fallbacks. Wake kswapd now as the node
+	 * may be balanced overall and kswapd will not wake naturally.
+	 */
+	if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
+		set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
+
+	/* We are not allowed to try stealing from the whole block */
+	if (!whole_block)
+		goto single_page;
+
+	free_pages = move_freepages_block(zone, page, start_type,
+						&movable_pages);
+	/*
+	 * Determine how many pages are compatible with our allocation.
+	 * For movable allocation, it's the number of movable pages which
+	 * we just obtained. For other types it's a bit more tricky.
+	 */
+	if (start_type == MIGRATE_MOVABLE) {
+		alike_pages = movable_pages;
+	} else {
+		/*
+		 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
+		 * to MOVABLE pageblock, consider all non-movable pages as
+		 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
+		 * vice versa, be conservative since we can't distinguish the
+		 * exact migratetype of non-movable pages.
+		 */
+		if (old_block_type == MIGRATE_MOVABLE)
+			alike_pages = pageblock_nr_pages
+						- (free_pages + movable_pages);
+		else
+			alike_pages = 0;
+	}
+
+	/* moving whole block can fail due to zone boundary conditions */
+	if (!free_pages)
+		goto single_page;
+
+	/*
+	 * If a sufficient number of pages in the block are either free or of
+	 * comparable migratability as our allocation, claim the whole block.
+	 * "Sufficient" here is at least half of the pages of a pageblock.
+	 */
+	if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
+			page_group_by_mobility_disabled)
+		set_pageblock_migratetype(page, start_type);
+
+	return;
+
+single_page:
+	move_to_free_list(page, zone, current_order, start_type);
+}
+
+/*
+ * Check whether there is a suitable fallback freepage with requested order.
+ * If only_stealable is true, this function returns fallback_mt only if
+ * we can steal other freepages all together. This would help to reduce
+ * fragmentation due to mixed migratetype pages in one pageblock.
+ *
+ * Returns the fallback migratetype, or -1 when none is found. *can_steal
+ * is written only when area->nr_free is non-zero; on the early -1 return
+ * it is left untouched.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+			int migratetype, bool only_stealable, bool *can_steal)
+{
+	int i;
+	int fallback_mt;
+
+	if (area->nr_free == 0)
+		return -1;
+
+	*can_steal = false;
+	for (i = 0;; i++) {
+		fallback_mt = fallbacks[migratetype][i];
+		if (fallback_mt == MIGRATE_TYPES)
+			break;
+
+		if (free_area_empty(area, fallback_mt))
+			continue;
+
+		if (can_steal_fallback(order, migratetype))
+			*can_steal = true;
+
+		if (!only_stealable)
+			return fallback_mt;
+
+		if (*can_steal)
+			return fallback_mt;
+	}
+
+	return -1;
+}
+
+/*
+ * Reserve a pageblock for exclusive use of high-order atomic allocations if
+ * there are no empty page blocks that contain a page with a suitable order
+ *
+ * NOTE(review): alloc_order is not referenced anywhere in the body.
+ */
+static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
+				unsigned int alloc_order)
+{
+	int mt;
+	unsigned long max_managed, flags;
+
+	/*
+	 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
+	 * Check is race-prone but harmless.
+	 */
+	max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
+	if (zone->nr_reserved_highatomic >= max_managed)
+		return;
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	/* Recheck the nr_reserved_highatomic limit under the lock */
+	if (zone->nr_reserved_highatomic >= max_managed)
+		goto out_unlock;
+
+	/* Yoink! */
+	mt = get_pageblock_migratetype(page);
+	/* Only reserve normal pageblocks (i.e., they can merge with others) */
+	if (migratetype_is_mergeable(mt)) {
+		zone->nr_reserved_highatomic += pageblock_nr_pages;
+		set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
+		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Used when an allocation is about to fail under memory pressure. This
+ * potentially hurts the reliability of high-order allocations when under
+ * intense memory pressure but failed atomic allocations should be easier
+ * to recover from than an OOM.
+ *
+ * If @force is true, try to unreserve a pageblock even though highatomic
+ * pageblock is exhausted.
+ *
+ * Returns true as soon as move_freepages_block() reports that it moved
+ * pages for one pageblock, false when no zone of the zonelist yielded any.
+ */
+static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
+						bool force)
+{
+	struct zonelist *zonelist = ac->zonelist;
+	unsigned long flags;
+	struct zoneref *z;
+	struct zone *zone;
+	struct page *page;
+	int order;
+	bool ret;
+
+	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
+								ac->nodemask) {
+		/*
+		 * Preserve at least one pageblock unless memory pressure
+		 * is really high.
+		 */
+		if (!force && zone->nr_reserved_highatomic <=
+					pageblock_nr_pages)
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		for (order = 0; order < MAX_ORDER; order++) {
+			struct free_area *area = &(zone->free_area[order]);
+
+			page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
+			if (!page)
+				continue;
+
+			/*
+			 * In page freeing path, migratetype change is racy so
+			 * we can encounter several free pages in a pageblock
+			 * in this loop although we changed the pageblock type
+			 * from highatomic to ac->migratetype. So we should
+			 * adjust the count once.
+			 */
+			if (is_migrate_highatomic_page(page)) {
+				/*
+				 * It should never happen but changes to
+				 * locking could inadvertently allow a per-cpu
+				 * drain to add pages to MIGRATE_HIGHATOMIC
+				 * while unreserving so be safe and watch for
+				 * underflows.
+				 */
+				zone->nr_reserved_highatomic -= min(
+						pageblock_nr_pages,
+						zone->nr_reserved_highatomic);
+			}
+
+			/*
+			 * Convert to ac->migratetype and avoid the normal
+			 * pageblock stealing heuristics. Minimally, the caller
+			 * is doing the work and needs the pages. More
+			 * importantly, if the block was always converted to
+			 * MIGRATE_UNMOVABLE or another type then the number
+			 * of pageblocks that cannot be completely freed
+			 * may increase.
+			 *
+			 * ret is a bool, so the page count returned by
+			 * move_freepages_block() is reduced to zero/non-zero.
+			 */
+			set_pageblock_migratetype(page, ac->migratetype);
+			ret = move_freepages_block(zone, page, ac->migratetype,
+									NULL);
+			if (ret) {
+				spin_unlock_irqrestore(&zone->lock, flags);
+				return ret;
+			}
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+
+	return false;
+}
+
+/*
+ * Try finding a free buddy page on the fallback list and put it on the free
+ * list of requested migratetype, possibly along with other pages from the same
+ * block, depending on fragmentation avoidance heuristics. Returns true if
+ * fallback was found so that __rmqueue_smallest() can grab it.
+ *
+ * The use of signed ints for order and current_order is a deliberate
+ * deviation from the rest of this file, to make the for loop
+ * condition simpler.
+ */
+static __always_inline bool
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+						unsigned int alloc_flags)
+{
+	struct free_area *area;
+	int current_order;
+	int min_order = order;
+	struct page *page;
+	int fallback_mt;
+	bool can_steal;
+
+	/*
+	 * Do not steal pages from freelists belonging to other pageblocks
+	 * i.e. orders < pageblock_order. If there are no local zones free,
+	 * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
+	 */
+	if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
+		min_order = pageblock_order;
+
+	/*
+	 * Find the largest available free page in the other list. This roughly
+	 * approximates finding the pageblock with the most free pages, which
+	 * would be too costly to do exactly.
+	 */
+	for (current_order = MAX_ORDER - 1; current_order >= min_order;
+				--current_order) {
+		area = &(zone->free_area[current_order]);
+		fallback_mt = find_suitable_fallback(area, current_order,
+				start_migratetype, false, &can_steal);
+		if (fallback_mt == -1)
+			continue;
+
+		/*
+		 * We cannot steal all free pages from the pageblock and the
+		 * requested migratetype is movable. In that case it's better to
+		 * steal and split the smallest available page instead of the
+		 * largest available page, because even if the next movable
+		 * allocation falls back into a different pageblock than this
+		 * one, it won't cause permanent fragmentation.
+		 */
+		if (!can_steal && start_migratetype == MIGRATE_MOVABLE
+					&& current_order > order)
+			goto find_smallest;
+
+		goto do_steal;
+	}
+
+	return false;
+
+find_smallest:
+	for (current_order = order; current_order < MAX_ORDER;
+							current_order++) {
+		area = &(zone->free_area[current_order]);
+		fallback_mt = find_suitable_fallback(area, current_order,
+				start_migratetype, false, &can_steal);
+		if (fallback_mt != -1)
+			break;
+	}
+
+	/*
+	 * This should not happen - we already found a suitable fallback
+	 * when looking for the largest page.
+	 *
+	 * Past this point area, fallback_mt, can_steal and current_order
+	 * all describe the free area chosen by whichever loop ran last.
+	 */
+	VM_BUG_ON(current_order == MAX_ORDER);
+
+do_steal:
+	page = get_page_from_free_area(area, fallback_mt);
+
+	steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
+								can_steal);
+
+	trace_mm_page_alloc_extfrag(page, order, current_order,
+		start_migratetype, fallback_mt);
+
+	return true;
+
+}
+
+/*
+ * Do the hard work of removing an element from the buddy allocator.
+ * Call me with the zone->lock already held.
+ *
+ * The free lists of the requested migratetype are tried first. When they
+ * have nothing, the CMA lists are tried if ALLOC_CMA is set; failing that,
+ * __rmqueue_fallback() is asked to move a page over from another
+ * migratetype and, when it reports success, the lookup is retried.
+ * Returns NULL when all of these fail.
+ */
+static __always_inline struct page *
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
+						unsigned int alloc_flags)
+{
+	struct page *page;
+
+	if (IS_ENABLED(CONFIG_CMA)) {
+		/*
+		 * Balance movable allocations between regular and CMA areas by
+		 * allocating from CMA when over half of the zone's free memory
+		 * is in the CMA area.
+		 */
+		if (alloc_flags & ALLOC_CMA &&
+		    zone_page_state(zone, NR_FREE_CMA_PAGES) >
+		    zone_page_state(zone, NR_FREE_PAGES) / 2) {
+			page = __rmqueue_cma_fallback(zone, order);
+			if (page)
+				return page;
+		}
+	}
+retry:
+	page = __rmqueue_smallest(zone, order, migratetype);
+	if (unlikely(!page)) {
+		if (alloc_flags & ALLOC_CMA)
+			page = __rmqueue_cma_fallback(zone, order);
+
+		if (!page && __rmqueue_fallback(zone, order, migratetype,
+								alloc_flags))
+			goto retry;
+	}
+	return page;
+}
+
+/*
+ * Obtain a specified number of elements from the buddy allocator, all under
+ * a single hold of the lock, for efficiency.  Add them to the supplied list.
+ * Returns the number of new pages which were placed at *list.
+ */
+static int rmqueue_bulk(struct zone *zone, unsigned int order,
+			unsigned long count, struct list_head *list,
+			int migratetype, unsigned int alloc_flags)
+{
+	unsigned long flags;
+	int i, allocated = 0;	/* i: pages taken from buddy; allocated: pages listed */
+
+	spin_lock_irqsave(&zone->lock, flags);
+	for (i = 0; i < count; ++i) {
+		struct page *page = __rmqueue(zone, order, migratetype,
+								alloc_flags);
+		if (unlikely(page == NULL))
+			break;
+		/* A page rejected by check_pcp_refill() is skipped, not listed. */
+		if (unlikely(check_pcp_refill(page, order)))
+			continue;
+
+		/*
+		 * Split buddy pages returned by expand() are received here in
+		 * physical page order. The page is added to the tail of
+		 * caller's list. From the caller's perspective, the linked list
+		 * is ordered by page number under some conditions, so walking
+		 * it forward from the head also visits the pages in physical
+		 * page order. This is useful for IO devices that can merge IO
+		 * requests if the physical pages are ordered properly.
+		 */
+		list_add_tail(&page->pcp_list, list);
+		allocated++;
+		if (is_migrate_cma(get_pcppage_migratetype(page)))
+			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
+					      -(1 << order));
+	}
+
+	/*
+	 * i pages were removed from the buddy list even if some leak due
+	 * to check_pcp_refill failing so adjust NR_FREE_PAGES based
+	 * on i. Do not confuse with 'allocated' which is the number of
+	 * pages added to the pcp list.
+	 *
+	 * Note that NR_FREE_CMA_PAGES above is only adjusted for pages that
+	 * reached the list, whereas NR_FREE_PAGES is adjusted for all i.
+	 */
+	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
+	spin_unlock_irqrestore(&zone->lock, flags);
+	return allocated;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Called from the vmstat counter updater to drain pagesets of this
+ * currently executing processor on remote nodes after they have
+ * expired.
+ */
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
+{
+	int to_drain, batch;
+	/* batch and pcp->count are sampled before pcp->lock is taken. */
+	batch = READ_ONCE(pcp->batch);
+	to_drain = min(pcp->count, batch);
+	if (to_drain > 0) {
+		spin_lock(&pcp->lock);
+		free_pcppages_bulk(zone, to_drain, pcp, 0);
+		spin_unlock(&pcp->lock);
+	}
+}
+#endif
+
+/*
+ * Drain pcplists of the indicated processor and zone.
+ *
+ * pcp->count is tested before pcp->lock is taken; when it reads as zero
+ * the pcp is skipped without locking. Otherwise the whole count is handed
+ * to free_pcppages_bulk() under pcp->lock.
+ */
+static void drain_pages_zone(unsigned int cpu, struct zone *zone)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+	if (pcp->count) {
+		spin_lock(&pcp->lock);
+		free_pcppages_bulk(zone, pcp->count, pcp, 0);
+		spin_unlock(&pcp->lock);
+	}
+}
+
+/*
+ * Drain pcplists of all zones on the indicated processor.
+ * Only populated zones are visited (for_each_populated_zone()).
+ */
+static void drain_pages(unsigned int cpu)
+{
+	struct zone *zone;
+
+	for_each_populated_zone(zone) {
+		drain_pages_zone(cpu, zone);
+	}
+}
+
+/*
+ * Spill all of this CPU's per-cpu pages back into the buddy allocator.
+ *
+ * When @zone is non-NULL only that zone's pcplists are drained; the CPU
+ * is whatever smp_processor_id() reports at the time of the call.
+ */
+void drain_local_pages(struct zone *zone)
+{
+	int cpu = smp_processor_id();
+
+	if (zone)
+		drain_pages_zone(cpu, zone);
+	else
+		drain_pages(cpu);
+}
+
+/*
+ * The implementation of drain_all_pages(), exposing an extra parameter to
+ * drain on all cpus.
+ *
+ * drain_all_pages() is optimized to only execute on cpus where pcplists are
+ * not empty. The check for non-emptiness can however race with a free to
+ * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
+ * that need the guarantee that every CPU has drained can disable the
+ * optimizing racy check.
+ *
+ * @zone:           zone to drain, or NULL to drain every populated zone
+ * @force_all_cpus: when true, skip the pcp->count check and drain every
+ *                  online CPU
+ */
+static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
+{
+	int cpu;
+
+	/*
+	 * Allocate in the BSS so we won't require allocation in
+	 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
+	 */
+	static cpumask_t cpus_with_pcps;
+
+	/*
+	 * Do not drain if one is already in progress unless it's specific to
+	 * a zone. Such callers are primarily CMA and memory hotplug and need
+	 * the drain to be complete when the call returns.
+	 */
+	if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
+		if (!zone)
+			return;
+		mutex_lock(&pcpu_drain_mutex);
+	}
+
+	/*
+	 * We don't care about racing with CPU hotplug event
+	 * as offline notification will cause the notified
+	 * cpu to drain that CPU pcps and on_each_cpu_mask
+	 * disables preemption as part of its processing
+	 *
+	 * NOTE(review): nothing in this function calls on_each_cpu_mask();
+	 * the drain further down is a plain for_each_cpu() loop executed by
+	 * the calling CPU. The sentence above looks stale - confirm.
+	 *
+	 * This first loop only builds cpus_with_pcps. The mask is static, so
+	 * each online CPU's bit is explicitly set or cleared on every call.
+	 */
+	for_each_online_cpu(cpu) {
+		struct per_cpu_pages *pcp;
+		struct zone *z;
+		bool has_pcps = false;
+
+		if (force_all_cpus) {
+			/*
+			 * The pcp.count check is racy, some callers need a
+			 * guarantee that no cpu is missed.
+			 */
+			has_pcps = true;
+		} else if (zone) {
+			pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+			if (pcp->count)
+				has_pcps = true;
+		} else {
+			for_each_populated_zone(z) {
+				pcp = per_cpu_ptr(z->per_cpu_pageset, cpu);
+				if (pcp->count) {
+					has_pcps = true;
+					break;
+				}
+			}
+		}
+
+		if (has_pcps)
+			cpumask_set_cpu(cpu, &cpus_with_pcps);
+		else
+			cpumask_clear_cpu(cpu, &cpus_with_pcps);
+	}
+	/* Drain the selected CPUs; still under pcpu_drain_mutex. */
+	for_each_cpu(cpu, &cpus_with_pcps) {
+		if (zone)
+			drain_pages_zone(cpu, zone);
+		else
+			drain_pages(cpu);
+	}
+
+	mutex_unlock(&pcpu_drain_mutex);
+}
+
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * When zone parameter is non-NULL, spill just the single zone's pages.
+ *
+ * This is __drain_all_pages() with force_all_cpus == false, i.e. CPUs whose
+ * pcp->count reads as zero are skipped.
+ */
+void drain_all_pages(struct zone *zone)
+{
+	__drain_all_pages(zone, false);
+}
+
+#ifdef CONFIG_HIBERNATION
+
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */ +#define WD_PAGE_COUNT (128*1024) + +void mark_free_pages(struct zone *zone) +{ + unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; + unsigned long flags; + unsigned int order, t; + struct page *page; + + if (zone_is_empty(zone)) + return; + + spin_lock_irqsave(&zone->lock, flags); + + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + + if (page_zone(page) != zone) + continue; + + if (!swsusp_page_is_forbidden(page)) + swsusp_unset_page_free(page); + } + + for_each_migratetype_order(order, t) { + list_for_each_entry(page, + &zone->free_area[order].free_list[t], buddy_list) { + unsigned long i; + + pfn = page_to_pfn(page); + for (i = 0; i < (1UL << order); i++) { + if (!--page_count) { + touch_nmi_watchdog(); + page_count = WD_PAGE_COUNT; + } + swsusp_set_page_free(pfn_to_page(pfn + i)); + } + } + } + spin_unlock_irqrestore(&zone->lock, flags); +} +#endif /* CONFIG_PM */ + +static bool free_unref_page_prepare(struct page *page, unsigned long pfn, + unsigned int order) +{ + int migratetype; + + if (!free_pcp_prepare(page, order)) + return false; + + migratetype = get_pfnblock_migratetype(page, pfn); + set_pcppage_migratetype(page, migratetype); + return true; +} + +static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch, + bool free_high) +{ + int min_nr_free, max_nr_free; + + /* Free everything if batch freeing high-order pages. */ + if (unlikely(free_high)) + return pcp->count; + + /* Check for PCP disabled or boot pageset */ + if (unlikely(high < batch)) + return 1; + + /* Leave at least pcp->batch pages on the list */ + min_nr_free = batch; + max_nr_free = high - batch; + + /* + * Double the number of pages freed each time there is subsequent + * freeing of pages without any allocation. 
+ */ + batch <<= pcp->free_factor; + if (batch < max_nr_free) + pcp->free_factor++; + batch = clamp(batch, min_nr_free, max_nr_free); + + return batch; +} + +static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, + bool free_high) +{ + int high = READ_ONCE(pcp->high); + + if (unlikely(!high || free_high)) + return 0; + + if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) + return high; + + /* + * If reclaim is active, limit the number of pages that can be + * stored on pcp lists + */ + return min(READ_ONCE(pcp->batch) << 2, high); +} + +static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, + struct page *page, int migratetype, + unsigned int order) +{ + int high; + int pindex; + bool free_high; + + __count_vm_events(PGFREE, 1 << order); + pindex = order_to_pindex(migratetype, order); + list_add(&page->pcp_list, &pcp->lists[pindex]); + pcp->count += 1 << order; + + /* + * As high-order pages other than THP's stored on PCP can contribute + * to fragmentation, limit the number stored when PCP is heavily + * freeing without allocation. The remainder after bulk freeing + * stops will be drained from vmstat refresh context. + */ + free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER); + + high = nr_pcp_high(pcp, zone, free_high); + if (pcp->count >= high) { + int batch = READ_ONCE(pcp->batch); + + free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex); + } +} + +/* + * Free a pcp page + */ +void free_unref_page(struct page *page, unsigned int order) +{ + unsigned long __maybe_unused UP_flags; + struct per_cpu_pages *pcp; + struct zone *zone; + unsigned long pfn = page_to_pfn(page); + int migratetype, pcpmigratetype; + + if (!free_unref_page_prepare(page, pfn, order)) + return; + + /* + * We only track unmovable, reclaimable and movable on pcp lists. 
+ * Place ISOLATE pages on the isolated list because they are being + * offlined but treat HIGHATOMIC and CMA as movable pages so we can + * get those areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ + migratetype = pcpmigratetype = get_pcppage_migratetype(page); + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { + if (unlikely(is_migrate_isolate(migratetype))) { + free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); + return; + } + pcpmigratetype = MIGRATE_MOVABLE; + } + + zone = page_zone(page); + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (pcp) { + free_unref_page_commit(zone, pcp, page, pcpmigratetype, order); + pcp_spin_unlock(pcp); + } else { + free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); + } + pcp_trylock_finish(UP_flags); +} + +/* + * Free a list of 0-order pages + */ +void free_unref_page_list(struct list_head *list) +{ + unsigned long __maybe_unused UP_flags; + struct page *page, *next; + struct per_cpu_pages *pcp = NULL; + struct zone *locked_zone = NULL; + int batch_count = 0; + int migratetype; + + /* Prepare pages for freeing */ + list_for_each_entry_safe(page, next, list, lru) { + unsigned long pfn = page_to_pfn(page); + if (!free_unref_page_prepare(page, pfn, 0)) { + list_del(&page->lru); + continue; + } + + /* + * Free isolated pages directly to the allocator, see + * comment in free_unref_page. + */ + migratetype = get_pcppage_migratetype(page); + if (unlikely(is_migrate_isolate(migratetype))) { + list_del(&page->lru); + free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + continue; + } + } + + list_for_each_entry_safe(page, next, list, lru) { + struct zone *zone = page_zone(page); + + list_del(&page->lru); + migratetype = get_pcppage_migratetype(page); + + /* Different zone, different pcp lock. 
*/ + if (zone != locked_zone) { + if (pcp) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); + } + + /* + * trylock is necessary as pages may be getting freed + * from IRQ or SoftIRQ context after an IO completion. + */ + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (unlikely(!pcp)) { + pcp_trylock_finish(UP_flags); + free_one_page(zone, page, page_to_pfn(page), + 0, migratetype, FPI_NONE); + locked_zone = NULL; + continue; + } + locked_zone = zone; + batch_count = 0; + } + + /* + * Non-isolated types over MIGRATE_PCPTYPES get added + * to the MIGRATE_MOVABLE pcp list. + */ + if (unlikely(migratetype >= MIGRATE_PCPTYPES)) + migratetype = MIGRATE_MOVABLE; + + trace_mm_page_free_batched(page); + free_unref_page_commit(zone, pcp, page, migratetype, 0); + + /* + * Guard against excessive lock hold times when freeing + * a large list of pages. Lock will be reacquired if + * necessary on the next iteration. + */ + if (++batch_count == SWAP_CLUSTER_MAX) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); + batch_count = 0; + pcp = NULL; + locked_zone = NULL; + } + } + + if (pcp) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); + } +} + +/* + * split_page takes a non-compound higher-order page, and splits it into + * n (1<_watermark[WMARK_MIN] + (1UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + return 0; + + __mod_zone_freepage_state(zone, -(1UL << order), mt); + } + + del_page_from_free_list(page, zone, order); + + /* + * Set the pageblock if the isolated page is at least half of a + * pageblock + */ + if (order >= pageblock_order - 1) { + struct page *endpage = page + (1 << order) - 1; + for (; page < endpage; page += pageblock_nr_pages) { + int mt = get_pageblock_migratetype(page); + /* + * Only change normal pageblocks (i.e., they can merge + * with others) + */ + if (migratetype_is_mergeable(mt)) + set_pageblock_migratetype(page, + MIGRATE_MOVABLE); + } + } + + return 1UL << 
order; +} + +/** + * __putback_isolated_page - Return a now-isolated page back where we got it + * @page: Page that was isolated + * @order: Order of the isolated page + * @mt: The page's pageblock's migratetype + * + * This function is meant to return a page pulled from the free lists via + * __isolate_free_page back to the free lists they were pulled from. + */ +void __putback_isolated_page(struct page *page, unsigned int order, int mt) +{ + struct zone *zone = page_zone(page); + + /* zone lock should be held when this function is called */ + lockdep_assert_held(&zone->lock); + + /* Return isolated page to tail of freelist. */ + __free_one_page(page, page_to_pfn(page), zone, order, mt, + FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL); +} + +/* + * Update NUMA hit/miss statistics + */ +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, + long nr_account) +{ +#ifdef CONFIG_NUMA + enum numa_stat_item local_stat = NUMA_LOCAL; + + /* skip numa counters update if numa stats is disabled */ + if (!static_branch_likely(&vm_numa_stat_key)) + return; + + if (zone_to_nid(z) != numa_node_id()) + local_stat = NUMA_OTHER; + + if (zone_to_nid(z) == zone_to_nid(preferred_zone)) + __count_numa_events(z, NUMA_HIT, nr_account); + else { + __count_numa_events(z, NUMA_MISS, nr_account); + __count_numa_events(preferred_zone, NUMA_FOREIGN, nr_account); + } + __count_numa_events(z, local_stat, nr_account); +#endif +} + +static __always_inline +struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, + unsigned int order, unsigned int alloc_flags, + int migratetype) +{ + struct page *page; + unsigned long flags; + + do { + page = NULL; + spin_lock_irqsave(&zone->lock, flags); + /* + * order-0 request can reach here when the pcplist is skipped + * due to non-CMA allocation context. HIGHATOMIC area is + * reserved for high-order atomic allocation, so order-0 + * request should skip it. 
+ */ + if (order > 0 && alloc_flags & ALLOC_HARDER) + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (!page) { + page = __rmqueue(zone, order, migratetype, alloc_flags); + if (!page) { + spin_unlock_irqrestore(&zone->lock, flags); + return NULL; + } + } + __mod_zone_freepage_state(zone, -(1 << order), + get_pcppage_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); + } while (check_new_pages(page, order)); + + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone, 1); + + return page; +} + +/* Remove page from the per-cpu list, caller must protect the list */ +static inline +struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, + int migratetype, + unsigned int alloc_flags, + struct per_cpu_pages *pcp, + struct list_head *list) +{ + struct page *page; + + do { + if (list_empty(list)) { + int batch = READ_ONCE(pcp->batch); + int alloced; + + /* + * Scale batch relative to order if batch implies + * free pages can be stored on the PCP. Batch can + * be 1 for small zones or for boot pagesets which + * should never store free pages as the pages may + * belong to arbitrary zones. + */ + if (batch > 1) + batch = max(batch >> order, 2); + alloced = rmqueue_bulk(zone, order, + batch, list, + migratetype, alloc_flags); + + pcp->count += alloced << order; + if (unlikely(list_empty(list))) + return NULL; + } + + page = list_first_entry(list, struct page, pcp_list); + list_del(&page->pcp_list); + pcp->count -= 1 << order; + } while (check_new_pcp(page, order)); + + return page; +} + +/* Lock and remove page from the per-cpu list */ +static struct page *rmqueue_pcplist(struct zone *preferred_zone, + struct zone *zone, unsigned int order, + int migratetype, unsigned int alloc_flags) +{ + struct per_cpu_pages *pcp; + struct list_head *list; + struct page *page; + unsigned long __maybe_unused UP_flags; + + /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. 
*/ + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (!pcp) { + pcp_trylock_finish(UP_flags); + return NULL; + } + + /* + * On allocation, reduce the number of pages that are batch freed. + * See nr_pcp_free() where free_factor is increased for subsequent + * frees. + */ + pcp->free_factor >>= 1; + list = &pcp->lists[order_to_pindex(migratetype, order)]; + page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); + if (page) { + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone, 1); + } + return page; +} + +/* + * Allocate a page from the given zone. + * Use pcplists for THP or "cheap" high-order allocations. + */ + +/* + * Do not instrument rmqueue() with KMSAN. This function may call + * __msan_poison_alloca() through a call to set_pfnblock_flags_mask(). + * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it + * may call rmqueue() again, which will result in a deadlock. + */ +__no_sanitize_memory +static inline +struct page *rmqueue(struct zone *preferred_zone, + struct zone *zone, unsigned int order, + gfp_t gfp_flags, unsigned int alloc_flags, + int migratetype) +{ + struct page *page; + + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); + + if (likely(pcp_allowed_order(order))) { + /* + * MIGRATE_MOVABLE pcplist could have the pages on CMA area and + * we need to skip it when CMA area isn't allowed. 
+ */ + if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || + migratetype != MIGRATE_MOVABLE) { + page = rmqueue_pcplist(preferred_zone, zone, order, + migratetype, alloc_flags); + if (likely(page)) + goto out; + } + } + + page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags, + migratetype); + +out: + /* Separate test+clear to avoid unnecessary atomics */ + if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { + clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); + wakeup_kswapd(zone, 0, 0, zone_idx(zone)); + } + + VM_BUG_ON_PAGE(page && bad_range(zone, page), page); + return page; +} + +#ifdef CONFIG_FAIL_PAGE_ALLOC + +static struct { + struct fault_attr attr; + + bool ignore_gfp_highmem; + bool ignore_gfp_reclaim; + u32 min_order; +} fail_page_alloc = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_gfp_reclaim = true, + .ignore_gfp_highmem = true, + .min_order = 1, +}; + +static int __init setup_fail_page_alloc(char *str) +{ + return setup_fault_attr(&fail_page_alloc.attr, str); +} +__setup("fail_page_alloc=", setup_fail_page_alloc); + +static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + int flags = 0; + + if (order < fail_page_alloc.min_order) + return false; + if (gfp_mask & __GFP_NOFAIL) + return false; + if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) + return false; + if (fail_page_alloc.ignore_gfp_reclaim && + (gfp_mask & __GFP_DIRECT_RECLAIM)) + return false; + + /* See comment in __should_failslab() */ + if (gfp_mask & __GFP_NOWARN) + flags |= FAULT_NOWARN; + + return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_page_alloc_debugfs(void) +{ + umode_t mode = S_IFREG | 0600; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_page_alloc", NULL, + &fail_page_alloc.attr); + + debugfs_create_bool("ignore-gfp-wait", mode, dir, + &fail_page_alloc.ignore_gfp_reclaim); + 
debugfs_create_bool("ignore-gfp-highmem", mode, dir, + &fail_page_alloc.ignore_gfp_highmem); + debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); + + return 0; +} + +late_initcall(fail_page_alloc_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#else /* CONFIG_FAIL_PAGE_ALLOC */ + +static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + return false; +} + +#endif /* CONFIG_FAIL_PAGE_ALLOC */ + +noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +{ + return __should_fail_alloc_page(gfp_mask, order); +} +ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); + +static inline long __zone_watermark_unusable_free(struct zone *z, + unsigned int order, unsigned int alloc_flags) +{ + const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); + long unusable_free = (1 << order) - 1; + + /* + * If the caller does not have rights to ALLOC_HARDER then subtract + * the high-atomic reserves. This will over-estimate the size of the + * atomic reserve but it avoids a search. + */ + if (likely(!alloc_harder)) + unusable_free += z->nr_reserved_highatomic; + +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); +#endif + + return unusable_free; +} + +/* + * Return true if free base pages are above 'mark'. For high-order checks it + * will return true of the order-0 watermark is reached and there is at least + * one free page of a suitable size. Checking now avoids taking the zone lock + * to check in the allocation paths if no pages are free. 
+ */
+bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+			 int highest_zoneidx, unsigned int alloc_flags,
+			 long free_pages)
+{
+	long min = mark;
+	int o;
+	const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
+
+	/* free_pages may go negative - that's OK */
+	free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
+	/* ALLOC_HIGH lowers the effective watermark by half. */
+	if (alloc_flags & ALLOC_HIGH)
+		min -= min / 2;
+
+	if (unlikely(alloc_harder)) {
+		/*
+		 * OOM victims can try even harder than normal ALLOC_HARDER
+		 * users on the grounds that it's definitely going to be in
+		 * the exit path shortly and free memory. Any allocation it
+		 * makes during the free path will be small and short-lived.
+		 *
+		 * Concretely: ALLOC_OOM takes off a further half of min,
+		 * anything else that set alloc_harder takes off a quarter.
+		 */
+		if (alloc_flags & ALLOC_OOM)
+			min -= min / 2;
+		else
+			min -= min / 4;
+	}
+
+	/*
+	 * Check watermarks for an order-0 allocation request. If these
+	 * are not met, then a high-order request also cannot go ahead
+	 * even if a suitable page happened to be free.
+	 */
+	if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
+		return false;
+
+	/* If this is an order-0 request then the watermark is fine */
+	if (!order)
+		return true;
+
+	/* For a high-order request, check at least one suitable page is free */
+	for (o = order; o < MAX_ORDER; o++) {
+		struct free_area *area = &z->free_area[o];
+		int mt;
+
+		if (!area->nr_free)
+			continue;
+		/* Any non-empty list of a pcp migratetype will do. */
+		for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
+			if (!free_area_empty(area, mt))
+				return true;
+		}
+
+#ifdef CONFIG_CMA
+		if ((alloc_flags & ALLOC_CMA) &&
+		    !free_area_empty(area, MIGRATE_CMA)) {
+			return true;
+		}
+#endif
+		if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
+			return true;
+	}
+	return false;
+}
+/*
+ * __zone_watermark_ok() with free_pages taken from the zone's
+ * NR_FREE_PAGES counter via zone_page_state().
+ */
+bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+		      int highest_zoneidx, unsigned int alloc_flags)
+{
+	return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
+					zone_page_state(z, NR_FREE_PAGES));
+}
+/*
+ * Watermark check with an order-0 shortcut. For order-0 requests a cheap
+ * comparison against mark + lowmem_reserve is tried first; if that does
+ * not succeed, or for any other order, __zone_watermark_ok() decides.
+ * A last attempt against the raw WMARK_MIN value is made for the
+ * __GFP_ATOMIC order-0 case described in the comment inside.
+ */
+static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
+				unsigned long mark, int highest_zoneidx,
+				unsigned int alloc_flags, gfp_t gfp_mask)
+{
+	long free_pages;
+
+	free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+	/*
+	 * Fast check for order-0 only. If this fails then the reserves
+	 * need to be calculated.
+	 */
+	if (!order) {
+		long usable_free;
+		long reserved;
+
+		usable_free = free_pages;
+		reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
+
+		/*
+		 * reserved may over estimate high-atomic reserves.
+		 * Subtracting min(usable_free, reserved) keeps usable_free
+		 * from dropping below zero when it starts non-negative.
+		 */
+		usable_free -= min(usable_free, reserved);
+		if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
+			return true;
+	}
+
+	if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
+					free_pages))
+		return true;
+	/*
+	 * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
+	 * when checking the min watermark. The min watermark is the
+	 * point where boosting is ignored so that kswapd is woken up
+	 * when below the low watermark.
+	 *
+	 * The retry below replaces mark with z->_watermark[WMARK_MIN].
+	 */
+	if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
+		&& ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
+		mark = z->_watermark[WMARK_MIN];
+		return __zone_watermark_ok(z, order, mark, highest_zoneidx,
+					alloc_flags, free_pages);
+	}
+
+	return false;
+}
+/*
+ * Like zone_watermark_ok() but always with alloc_flags == 0, and with
+ * free_pages re-read through zone_page_state_snapshot() when the zone has
+ * a percpu_drift_mark and the cheap counter reads below it.
+ */
+bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
+			unsigned long mark, int highest_zoneidx)
+{
+	long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+	return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
+								free_pages);
+}
+
+#ifdef CONFIG_NUMA
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
+/* True when @zone's node is within node_reclaim_distance of @local_zone's. */
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
+				node_reclaim_distance;
+}
+#else	/* CONFIG_NUMA */
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+	return true;
+}
+#endif	/* CONFIG_NUMA */
+
+/*
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
+ * premature use of a lower zone may cause lowmem pressure problems that
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
+ * probably too small. It only makes sense to spread allocations to avoid
+ * fragmentation between the Normal and DMA32 zones.
+ *
+ * Returns the initial alloc_flags for @gfp_mask: the __GFP_KSWAPD_RECLAIM
+ * bit carried over, plus ALLOC_NOFRAGMENT when CONFIG_ZONE_DMA32 is set,
+ * @zone is ZONE_NORMAL and the checks below pass. A NULL @zone is allowed.
+ */
+static inline unsigned int
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
+{
+	unsigned int alloc_flags;
+
+	/*
+	 * __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
+	 * to save a branch.
+	 */
+	alloc_flags = (__force int) (gfp_mask & __GFP_KSWAPD_RECLAIM);
+
+#ifdef CONFIG_ZONE_DMA32
+	if (!zone)
+		return alloc_flags;
+
+	if (zone_idx(zone) != ZONE_NORMAL)
+		return alloc_flags;
+
+	/*
+	 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
+	 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
+	 * on UMA that if Normal is populated then so is DMA32.
+	 *
+	 * In index terms ZONE_DMA32 sits directly below ZONE_NORMAL (the
+	 * BUILD_BUG_ON pins ZONE_NORMAL - ZONE_DMA32 to 1), which is why
+	 * --zone reaches it; "after" presumably refers to fallback order.
+	 * populated_zone() is only consulted when nr_online_nodes > 1.
+	 */
+	BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
+	if (nr_online_nodes > 1 && !populated_zone(--zone))
+		return alloc_flags;
+
+	alloc_flags |= ALLOC_NOFRAGMENT;
+#endif /* CONFIG_ZONE_DMA32 */
+	return alloc_flags;
+}
+
+/*
+ * Must be called after current_gfp_context() which can change gfp_mask.
+ *
+ * With CONFIG_CMA, adds ALLOC_CMA to @alloc_flags when @gfp_mask maps to
+ * MIGRATE_MOVABLE; otherwise returns @alloc_flags unchanged.
+ */
+static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask,
+						  unsigned int alloc_flags)
+{
+#ifdef CONFIG_CMA
+	if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+		alloc_flags |= ALLOC_CMA;
+#endif
+	return alloc_flags;
+}
+
+/*
+ * get_page_from_freelist goes through the zonelist trying to allocate
+ * a page.
+ */ +static struct page * +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, + const struct alloc_context *ac) +{ + struct zoneref *z; + struct zone *zone; + struct pglist_data *last_pgdat = NULL; + bool last_pgdat_dirty_ok = false; + bool no_fallback; + +retry: + /* + * Scan zonelist, looking for a zone with enough free. + * See also __cpuset_node_allowed() comment in kernel/cgroup/cpuset.c. + */ + no_fallback = alloc_flags & ALLOC_NOFRAGMENT; + z = ac->preferred_zoneref; + for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx, + ac->nodemask) { + struct page *page; + unsigned long mark; + + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp_mask)) + continue; + /* + * When allocating a page cache page for writing, we + * want to get it from a node that is within its dirty + * limit, such that no single node holds more than its + * proportional share of globally allowed dirty pages. + * The dirty limits take into account the node's + * lowmem reserves and high watermark so that kswapd + * should be able to balance it without having to + * write pages from its LRU list. + * + * XXX: For now, allow allocations to potentially + * exceed the per-node dirty limit in the slowpath + * (spread_dirty_pages unset) before going into reclaim, + * which is important when on a NUMA setup the allowed + * nodes are together not big enough to reach the + * global limit. The proper fix for these situations + * will require awareness of nodes in the + * dirty-throttling and the flusher threads. + */ + if (ac->spread_dirty_pages) { + if (last_pgdat != zone->zone_pgdat) { + last_pgdat = zone->zone_pgdat; + last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat); + } + + if (!last_pgdat_dirty_ok) + continue; + } + + if (no_fallback && nr_online_nodes > 1 && + zone != ac->preferred_zoneref->zone) { + int local_nid; + + /* + * If moving to a remote node, retry but allow + * fragmenting fallbacks. 
Locality is more important + * than fragmentation avoidance. + */ + local_nid = zone_to_nid(ac->preferred_zoneref->zone); + if (zone_to_nid(zone) != local_nid) { + alloc_flags &= ~ALLOC_NOFRAGMENT; + goto retry; + } + } + + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); + if (!zone_watermark_fast(zone, order, mark, + ac->highest_zoneidx, alloc_flags, + gfp_mask)) { + int ret; + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * Watermark failed for this zone, but see if we can + * grow this zone if it contains deferred pages. + */ + if (static_branch_unlikely(&deferred_pages)) { + if (_deferred_grow_zone(zone, order)) + goto try_this_zone; + } +#endif + /* Checked here to keep the fast path fast */ + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (alloc_flags & ALLOC_NO_WATERMARKS) + goto try_this_zone; + + if (!node_reclaim_enabled() || + !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) + continue; + + ret = node_reclaim(zone->zone_pgdat, gfp_mask, order); + switch (ret) { + case NODE_RECLAIM_NOSCAN: + /* did not scan */ + continue; + case NODE_RECLAIM_FULL: + /* scanned but unreclaimable */ + continue; + default: + /* did we reclaim enough */ + if (zone_watermark_ok(zone, order, mark, + ac->highest_zoneidx, alloc_flags)) + goto try_this_zone; + + continue; + } + } + +try_this_zone: + page = rmqueue(ac->preferred_zoneref->zone, zone, order, + gfp_mask, alloc_flags, ac->migratetype); + if (page) { + prep_new_page(page, order, gfp_mask, alloc_flags); + + /* + * If this is a high-order atomic allocation then check + * if the pageblock should be reserved for the future + */ + if (unlikely(order && (alloc_flags & ALLOC_HARDER))) + reserve_highatomic_pageblock(page, zone, order); + + return page; + } else { +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* Try again if zone has deferred pages */ + if (static_branch_unlikely(&deferred_pages)) { + if (_deferred_grow_zone(zone, order)) + goto try_this_zone; + } +#endif + } + } + + /* + * It's possible on 
a UMA machine to get through all zones that are + * fragmented. If avoiding fragmentation, reset and try again. + */ + if (no_fallback) { + alloc_flags &= ~ALLOC_NOFRAGMENT; + goto retry; + } + + return NULL; +} + +static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) +{ + unsigned int filter = SHOW_MEM_FILTER_NODES; + + /* + * This documents exceptions given to allocations in certain + * contexts that are allowed to allocate outside current's set + * of allowed nodes. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + if (tsk_is_oom_victim(current) || + (current->flags & (PF_MEMALLOC | PF_EXITING))) + filter &= ~SHOW_MEM_FILTER_NODES; + if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) + filter &= ~SHOW_MEM_FILTER_NODES; + + __show_mem(filter, nodemask, gfp_zone(gfp_mask)); +} + +void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); + + if ((gfp_mask & __GFP_NOWARN) || + !__ratelimit(&nopage_rs) || + ((gfp_mask & __GFP_DMA) && !has_managed_dma())) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl", + current->comm, &vaf, gfp_mask, &gfp_mask, + nodemask_pr_args(nodemask)); + va_end(args); + + cpuset_print_current_mems_allowed(); + pr_cont("\n"); + dump_stack(); + warn_alloc_show_mem(gfp_mask, nodemask); +} + +static inline struct page * +__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, + unsigned int alloc_flags, + const struct alloc_context *ac) +{ + struct page *page; + + page = get_page_from_freelist(gfp_mask, order, + alloc_flags|ALLOC_CPUSET, ac); + /* + * fallback to ignore cpuset restriction if our nodes + * are depleted + */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, + alloc_flags, ac); + + return page; +} + +static inline struct page * +__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, + const struct 
alloc_context *ac, unsigned long *did_some_progress) +{ + struct oom_control oc = { + .zonelist = ac->zonelist, + .nodemask = ac->nodemask, + .memcg = NULL, + .gfp_mask = gfp_mask, + .order = order, + }; + struct page *page; + + *did_some_progress = 0; + + /* + * Acquire the oom lock. If that fails, somebody else is + * making progress for us. + */ + if (!mutex_trylock(&oom_lock)) { + *did_some_progress = 1; + schedule_timeout_uninterruptible(1); + return NULL; + } + + /* + * Go through the zonelist yet one more time, keep very high watermark + * here, this is only to catch a parallel oom killing, we must fail if + * we're still under heavy pressure. But make sure that this reclaim + * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY + * allocation which will never fail due to oom_lock already held. + */ + page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & + ~__GFP_DIRECT_RECLAIM, order, + ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); + if (page) + goto out; + + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + /* + * We have already exhausted all our reclaim opportunities without any + * success so it is time to admit defeat. We will skip the OOM killer + * because it is very likely that the caller has a more reasonable + * fallback than shooting a random task. + * + * The OOM killer may not free memory on a specific node. + */ + if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE)) + goto out; + /* The OOM killer does not needlessly kill tasks for lowmem */ + if (ac->highest_zoneidx < ZONE_NORMAL) + goto out; + if (pm_suspended_storage()) + goto out; + /* + * XXX: GFP_NOFS allocations should rather fail than rely on + * other request to make a forward progress. 
+ * We are in an unfortunate situation where out_of_memory cannot + * do much for this context but let's try it to at least get + * access to memory reserved if the current task is killed (see + * out_of_memory). Once filesystems are ready to handle allocation + * failures more gracefully we should just bail out here. + */ + + /* Exhausted what can be done so it's blame time */ + if (out_of_memory(&oc) || + WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) { + *did_some_progress = 1; + + /* + * Help non-failing allocations by giving them access to memory + * reserves + */ + if (gfp_mask & __GFP_NOFAIL) + page = __alloc_pages_cpuset_fallback(gfp_mask, order, + ALLOC_NO_WATERMARKS, ac); + } +out: + mutex_unlock(&oom_lock); + return page; +} + +/* + * Maximum number of compaction retries with progress before the OOM + * killer is considered as the only way to move forward. + */ +#define MAX_COMPACT_RETRIES 16 + +#ifdef CONFIG_COMPACTION +/* Try memory compaction for high-order allocations before reclaim */ +static struct page * +__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + unsigned int alloc_flags, const struct alloc_context *ac, + enum compact_priority prio, enum compact_result *compact_result) +{ + struct page *page = NULL; + unsigned long pflags; + unsigned int noreclaim_flag; + + if (!order) + return NULL; + + psi_memstall_enter(&pflags); + delayacct_compact_start(); + noreclaim_flag = memalloc_noreclaim_save(); + + *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, + prio, &page); + + memalloc_noreclaim_restore(noreclaim_flag); + psi_memstall_leave(&pflags); + delayacct_compact_end(); + + if (*compact_result == COMPACT_SKIPPED) + return NULL; + /* + * At least in one zone compaction wasn't deferred or skipped, so let's + * count a compaction stall + */ + count_vm_event(COMPACTSTALL); + + /* Prep a captured page if available */ + if (page) + prep_new_page(page, order, gfp_mask, alloc_flags); + + /* Try to get a page from the
freelist if available */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + + if (page) { + struct zone *zone = page_zone(page); + + zone->compact_blockskip_flush = false; + compaction_defer_reset(zone, order, true); + count_vm_event(COMPACTSUCCESS); + return page; + } + + /* + * It's bad if compaction run occurs and fails. The most likely reason + * is that pages exist, but not enough to satisfy watermarks. + */ + count_vm_event(COMPACTFAIL); + + cond_resched(); + + return NULL; +} + +static inline bool +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, + enum compact_result compact_result, + enum compact_priority *compact_priority, + int *compaction_retries) +{ + int max_retries = MAX_COMPACT_RETRIES; + int min_priority; + bool ret = false; + int retries = *compaction_retries; + enum compact_priority priority = *compact_priority; + + if (!order) + return false; + + if (fatal_signal_pending(current)) + return false; + + if (compaction_made_progress(compact_result)) + (*compaction_retries)++; + + /* + * compaction considers all the zone as desperately out of memory + * so it doesn't really make much sense to retry except when the + * failure could be caused by insufficient priority + */ + if (compaction_failed(compact_result)) + goto check_priority; + + /* + * compaction was skipped because there are not enough order-0 pages + * to work with, so we retry only if it looks like reclaim can help. + */ + if (compaction_needs_reclaim(compact_result)) { + ret = compaction_zonelist_suitable(ac, order, alloc_flags); + goto out; + } + + /* + * make sure the compaction wasn't deferred or didn't bail out early + * due to locks contention before we declare that we should give up. + * But the next retry should use a higher priority if allowed, so + * we don't just keep bailing out endlessly. 
+ */ + if (compaction_withdrawn(compact_result)) { + goto check_priority; + } + + /* + * !costly requests are much more important than __GFP_RETRY_MAYFAIL + * costly ones because they are de facto nofail and invoke OOM + * killer to move on while costly can fail and users are ready + * to cope with that. 1/4 retries is rather arbitrary but we + * would need much more detailed feedback from compaction to + * make a better decision. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + max_retries /= 4; + if (*compaction_retries <= max_retries) { + ret = true; + goto out; + } + + /* + * Make sure there are attempts at the highest priority if we exhausted + * all retries or failed at the lower priorities. + */ +check_priority: + min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? + MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + + if (*compact_priority > min_priority) { + (*compact_priority)--; + *compaction_retries = 0; + ret = true; + } +out: + trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); + return ret; +} +#else +static inline struct page * +__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + unsigned int alloc_flags, const struct alloc_context *ac, + enum compact_priority prio, enum compact_result *compact_result) +{ + *compact_result = COMPACT_SKIPPED; + return NULL; +} + +static inline bool +should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, + enum compact_result compact_result, + enum compact_priority *compact_priority, + int *compaction_retries) +{ + struct zone *zone; + struct zoneref *z; + + if (!order || order > PAGE_ALLOC_COSTLY_ORDER) + return false; + + /* + * There are setups with compaction disabled which would prefer to loop + * inside the allocator rather than hit the oom killer prematurely. + * Let's give them a good hope and keep retrying while the order-0 + * watermarks are OK. 
+ */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { + if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), + ac->highest_zoneidx, alloc_flags)) + return true; + } + return false; +} +#endif /* CONFIG_COMPACTION */ + +#ifdef CONFIG_LOCKDEP +static struct lockdep_map __fs_reclaim_map = + STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); + +static bool __need_reclaim(gfp_t gfp_mask) +{ + /* no reclaim without waiting on it */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) + return false; + + /* this guy won't enter reclaim */ + if (current->flags & PF_MEMALLOC) + return false; + + if (gfp_mask & __GFP_NOLOCKDEP) + return false; + + return true; +} + +void __fs_reclaim_acquire(unsigned long ip) +{ + lock_acquire_exclusive(&__fs_reclaim_map, 0, 0, NULL, ip); +} + +void __fs_reclaim_release(unsigned long ip) +{ + lock_release(&__fs_reclaim_map, ip); +} + +void fs_reclaim_acquire(gfp_t gfp_mask) +{ + gfp_mask = current_gfp_context(gfp_mask); + + if (__need_reclaim(gfp_mask)) { + if (gfp_mask & __GFP_FS) + __fs_reclaim_acquire(_RET_IP_); + +#ifdef CONFIG_MMU_NOTIFIER + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); +#endif + + } +} +EXPORT_SYMBOL_GPL(fs_reclaim_acquire); + +void fs_reclaim_release(gfp_t gfp_mask) +{ + gfp_mask = current_gfp_context(gfp_mask); + + if (__need_reclaim(gfp_mask)) { + if (gfp_mask & __GFP_FS) + __fs_reclaim_release(_RET_IP_); + } +} +EXPORT_SYMBOL_GPL(fs_reclaim_release); +#endif + +/* + * Zonelists may change due to hotplug during allocation. Detect when zonelists + * have been rebuilt so allocation retries. Reader side does not lock and + * retries the allocation if zonelist changes. Writer side is protected by the + * embedded spin_lock. 
+ */ +static DEFINE_SEQLOCK(zonelist_update_seq); + +static unsigned int zonelist_iter_begin(void) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqbegin(&zonelist_update_seq); + + return 0; +} + +static unsigned int check_retry_zonelist(unsigned int seq) +{ + if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return read_seqretry(&zonelist_update_seq, seq); + + return seq; +} + +/* Perform direct synchronous page reclaim */ +static unsigned long +__perform_reclaim(gfp_t gfp_mask, unsigned int order, + const struct alloc_context *ac) +{ + unsigned int noreclaim_flag; + unsigned long progress; + + cond_resched(); + + /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); + fs_reclaim_acquire(gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); + + progress = try_to_free_pages(ac->zonelist, order, gfp_mask, + ac->nodemask); + + memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(gfp_mask); + + cond_resched(); + + return progress; +} + +/* The really slow allocator path where we enter direct reclaim */ +static inline struct page * +__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, + unsigned int alloc_flags, const struct alloc_context *ac, + unsigned long *did_some_progress) +{ + struct page *page = NULL; + unsigned long pflags; + bool drained = false; + + psi_memstall_enter(&pflags); + *did_some_progress = __perform_reclaim(gfp_mask, order, ac); + if (unlikely(!(*did_some_progress))) + goto out; + +retry: + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + + /* + * If an allocation failed after direct reclaim, it could be because + * pages are pinned on the per-cpu lists or in high alloc reserves. 
+ * Shrink them and try again + */ + if (!page && !drained) { + unreserve_highatomic_pageblock(ac, false); + drain_all_pages(NULL); + drained = true; + goto retry; + } +out: + psi_memstall_leave(&pflags); + + return page; +} + +static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, + const struct alloc_context *ac) +{ + struct zoneref *z; + struct zone *zone; + pg_data_t *last_pgdat = NULL; + enum zone_type highest_zoneidx = ac->highest_zoneidx; + + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx, + ac->nodemask) { + if (!managed_zone(zone)) + continue; + if (last_pgdat != zone->zone_pgdat) { + wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx); + last_pgdat = zone->zone_pgdat; + } + } +} + +static inline unsigned int +gfp_to_alloc_flags(gfp_t gfp_mask) +{ + unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + + /* + * __GFP_HIGH is assumed to be the same as ALLOC_HIGH + * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD + * to save two branches. + */ + BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); + BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); + + /* + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will + * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). + */ + alloc_flags |= (__force int) + (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); + + if (gfp_mask & __GFP_ATOMIC) { + /* + * Not worth trying to allocate harder for __GFP_NOMEMALLOC even + * if it can't schedule. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + alloc_flags |= ALLOC_HARDER; + /* + * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the + * comment for __cpuset_node_allowed(). 
+ */ + alloc_flags &= ~ALLOC_CPUSET; + } else if (unlikely(rt_task(current)) && in_task()) + alloc_flags |= ALLOC_HARDER; + + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); + + return alloc_flags; +} + +static bool oom_reserves_allowed(struct task_struct *tsk) +{ + if (!tsk_is_oom_victim(tsk)) + return false; + + /* + * !MMU doesn't have oom reaper so give access to memory reserves + * only to the thread with TIF_MEMDIE set + */ + if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) + return false; + + return true; +} + +/* + * Distinguish requests which really need access to full memory + * reserves from oom victims which can live with a portion of it + */ +static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) +{ + if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) + return 0; + if (gfp_mask & __GFP_MEMALLOC) + return ALLOC_NO_WATERMARKS; + if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) + return ALLOC_NO_WATERMARKS; + if (!in_interrupt()) { + if (current->flags & PF_MEMALLOC) + return ALLOC_NO_WATERMARKS; + else if (oom_reserves_allowed(current)) + return ALLOC_OOM; + } + + return 0; +} + +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +{ + return !!__gfp_pfmemalloc_flags(gfp_mask); +} + +/* + * Checks whether it makes sense to retry the reclaim to make a forward progress + * for the given allocation request. + * + * We give up when we either have tried MAX_RECLAIM_RETRIES in a row + * without success, or when we couldn't even meet the watermark if we + * reclaimed all remaining pages on the LRU lists. + * + * Returns true if a retry is viable or false to enter the oom path. 
+ */ +static inline bool +should_reclaim_retry(gfp_t gfp_mask, unsigned order, + struct alloc_context *ac, int alloc_flags, + bool did_some_progress, int *no_progress_loops) +{ + struct zone *zone; + struct zoneref *z; + bool ret = false; + + /* + * Costly allocations might have made a progress but this doesn't mean + * their order will become available due to high fragmentation so + * always increment the no progress counter for them + */ + if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) + *no_progress_loops = 0; + else + (*no_progress_loops)++; + + if (*no_progress_loops > MAX_RECLAIM_RETRIES) + goto out; + + + /* + * Keep reclaiming pages while there is a chance this will lead + * somewhere. If none of the target zones can satisfy our allocation + * request even if all reclaimable pages are considered then we are + * screwed and have to go OOM. + */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->highest_zoneidx, ac->nodemask) { + unsigned long available; + unsigned long reclaimable; + unsigned long min_wmark = min_wmark_pages(zone); + bool wmark; + + available = reclaimable = zone_reclaimable_pages(zone); + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + + /* + * Would the allocation succeed if we reclaimed all + * reclaimable pages? + */ + wmark = __zone_watermark_ok(zone, order, min_wmark, + ac->highest_zoneidx, alloc_flags, available); + trace_reclaim_retry_zone(z, order, reclaimable, + available, min_wmark, *no_progress_loops, wmark); + if (wmark) { + ret = true; + break; + } + } + + /* + * Memory allocation/reclaim might be called from a WQ context and the + * current implementation of the WQ concurrency control doesn't + * recognize that a particular WQ is congested if the worker thread is + * looping without ever sleeping. Therefore we have to do a short sleep + * here rather than calling cond_resched(). 
+ */ + if (current->flags & PF_WQ_WORKER) + schedule_timeout_uninterruptible(1); + else + cond_resched(); +out: + /* Before OOM, exhaust highatomic_reserve */ + if (!ret) + return unreserve_highatomic_pageblock(ac, true); + + return ret; +} + +static inline bool +check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) +{ + /* + * It's possible that cpuset's mems_allowed and the nodemask from + * mempolicy don't intersect. This should be normally dealt with by + * policy_nodemask(), but it's possible to race with cpuset update in + * such a way the check therein was true, and then it became false + * before we got our cpuset_mems_cookie here. + * This assumes that for all allocations, ac->nodemask can come only + * from MPOL_BIND mempolicy (whose documented semantics is to be ignored + * when it does not intersect with the cpuset restrictions) or the + * caller can deal with a violated nodemask. + */ + if (cpusets_enabled() && ac->nodemask && + !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { + ac->nodemask = NULL; + return true; + } + + /* + * When updating a task's mems_allowed or mempolicy nodemask, it is + * possible to race with parallel threads in such a way that our + * allocation can fail while the mask is being updated. If we are about + * to fail, check if the cpuset changed during allocation and if so, + * retry. 
+ */ + if (read_mems_allowed_retry(cpuset_mems_cookie)) + return true; + + return false; +} + +static inline struct page * +__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + struct alloc_context *ac) +{ + bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; + struct page *page = NULL; + unsigned int alloc_flags; + unsigned long did_some_progress; + enum compact_priority compact_priority; + enum compact_result compact_result; + int compaction_retries; + int no_progress_loops; + unsigned int cpuset_mems_cookie; + unsigned int zonelist_iter_cookie; + int reserve_flags; + + /* + * We also sanity check to catch abuse of atomic reserves being used by + * callers that are not in atomic context. + */ + if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == + (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) + gfp_mask &= ~__GFP_ATOMIC; + +restart: + compaction_retries = 0; + no_progress_loops = 0; + compact_priority = DEF_COMPACT_PRIORITY; + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist_iter_cookie = zonelist_iter_begin(); + + /* + * The fast path uses conservative alloc_flags to succeed only until + * kswapd needs to be woken up, and to avoid the cost of setting up + * alloc_flags precisely. So we do that now. + */ + alloc_flags = gfp_to_alloc_flags(gfp_mask); + + /* + * We need to recalculate the starting point for the zonelist iterator + * because we might have used different nodemask in the fast path, or + * there was a cpuset modification and we are retrying - otherwise we + * could end up iterating over non-eligible zones endlessly. + */ + ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->highest_zoneidx, ac->nodemask); + if (!ac->preferred_zoneref->zone) + goto nopage; + + /* + * Check for insane configurations where the cpuset doesn't contain + * any suitable zone to satisfy the request - e.g. non-movable + * GFP_HIGHUSER allocations from MOVABLE nodes only. 
+ */ + if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) { + struct zoneref *z = first_zones_zonelist(ac->zonelist, + ac->highest_zoneidx, + &cpuset_current_mems_allowed); + if (!z->zone) + goto nopage; + } + + if (alloc_flags & ALLOC_KSWAPD) + wake_all_kswapds(order, gfp_mask, ac); + + /* + * The adjusted alloc_flags might result in immediate success, so try + * that first + */ + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + if (page) + goto got_pg; + + /* + * For costly allocations, try direct compaction first, as it's likely + * that we have enough base pages and don't need to reclaim. For non- + * movable high-order allocations, do that as well, as compaction will + * try prevent permanent fragmentation by migrating from blocks of the + * same migratetype. + * Don't try this for allocations that are allowed to ignore + * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. + */ + if (can_direct_reclaim && + (costly_order || + (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) + && !gfp_pfmemalloc_allowed(gfp_mask)) { + page = __alloc_pages_direct_compact(gfp_mask, order, + alloc_flags, ac, + INIT_COMPACT_PRIORITY, + &compact_result); + if (page) + goto got_pg; + + /* + * Checks for costly allocations with __GFP_NORETRY, which + * includes some THP page fault allocations + */ + if (costly_order && (gfp_mask & __GFP_NORETRY)) { + /* + * If allocating entire pageblock(s) and compaction + * failed because all zones are below low watermarks + * or is prohibited because it recently failed at this + * order, fail immediately unless the allocator has + * requested compaction and reclaim retry. 
+ * + * Reclaim is + * - potentially very expensive because zones are far + * below their low watermarks or this is part of very + * bursty high order allocations, + * - not guaranteed to help because isolate_freepages() + * may not iterate over freed pages as part of its + * linear scan, and + * - unlikely to make entire pageblocks free on its + * own. + */ + if (compact_result == COMPACT_SKIPPED || + compact_result == COMPACT_DEFERRED) + goto nopage; + + /* + * Looks like reclaim/compaction is worth trying, but + * sync compaction could be very expensive, so keep + * using async compaction. + */ + compact_priority = INIT_COMPACT_PRIORITY; + } + } + +retry: + /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ + if (alloc_flags & ALLOC_KSWAPD) + wake_all_kswapds(order, gfp_mask, ac); + + reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); + if (reserve_flags) + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) | + (alloc_flags & ALLOC_KSWAPD); + + /* + * Reset the nodemask and zonelist iterators if memory policies can be + * ignored. These allocations are high priority and system rather than + * user oriented. 
+ */ + if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { + ac->nodemask = NULL; + ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->highest_zoneidx, ac->nodemask); + } + + /* Attempt with potentially adjusted zonelist and alloc_flags */ + page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + if (page) + goto got_pg; + + /* Caller is not willing to reclaim, we can't balance anything */ + if (!can_direct_reclaim) + goto nopage; + + /* Avoid recursion of direct reclaim */ + if (current->flags & PF_MEMALLOC) + goto nopage; + + /* Try direct reclaim and then allocating */ + page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, + &did_some_progress); + if (page) + goto got_pg; + + /* Try direct compaction and then allocating */ + page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, + compact_priority, &compact_result); + if (page) + goto got_pg; + + /* Do not loop if specifically requested */ + if (gfp_mask & __GFP_NORETRY) + goto nopage; + + /* + * Do not retry costly high order allocations unless they are + * __GFP_RETRY_MAYFAIL + */ + if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) + goto nopage; + + if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, + did_some_progress > 0, &no_progress_loops)) + goto retry; + + /* + * It doesn't make any sense to retry for the compaction if the order-0 + * reclaim is not able to make any progress because the current + * implementation of the compaction depends on the sufficient amount + * of free memory (see __compaction_suitable) + */ + if (did_some_progress > 0 && + should_compact_retry(ac, order, alloc_flags, + compact_result, &compact_priority, + &compaction_retries)) + goto retry; + + + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * an unnecessary OOM kill.
+ */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; + + /* Reclaim has failed us, start killing things */ + page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); + if (page) + goto got_pg; + + /* Avoid allocations with no watermarks from looping endlessly */ + if (tsk_is_oom_victim(current) && + (alloc_flags & ALLOC_OOM || + (gfp_mask & __GFP_NOMEMALLOC))) + goto nopage; + + /* Retry as long as the OOM killer is making progress */ + if (did_some_progress) { + no_progress_loops = 0; + goto retry; + } + +nopage: + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * an unnecessary OOM kill. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; + + /* + * Make sure that __GFP_NOFAIL request doesn't leak out and make sure + * we always retry + */ + if (gfp_mask & __GFP_NOFAIL) { + /* + * All existing users of the __GFP_NOFAIL are blockable, so warn + * of any new users that actually require GFP_NOWAIT + */ + if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask)) + goto fail; + + /* + * PF_MEMALLOC request from this context is rather bizarre + * because we cannot reclaim anything and only can loop waiting + * for somebody to do the work for us + */ + WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask); + + /* + * non failing costly orders are a hard requirement which we + * are not prepared for much so let's warn about these users + * so that we can identify them and convert them to something + * else.
+ */ + WARN_ON_ONCE_GFP(costly_order, gfp_mask); + + /* + * Help non-failing allocations by giving them access to memory + * reserves but do not use ALLOC_NO_WATERMARKS because this + * could deplete whole memory reserves which would just make + * the situation worse + */ + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); + if (page) + goto got_pg; + + cond_resched(); + goto retry; + } +fail: + warn_alloc(gfp_mask, ac->nodemask, + "page allocation failure: order:%u", order); +got_pg: + return page; +} + +static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, + int preferred_nid, nodemask_t *nodemask, + struct alloc_context *ac, gfp_t *alloc_gfp, + unsigned int *alloc_flags) +{ + ac->highest_zoneidx = gfp_zone(gfp_mask); + ac->zonelist = node_zonelist(preferred_nid, gfp_mask); + ac->nodemask = nodemask; + ac->migratetype = gfp_migratetype(gfp_mask); + + if (cpusets_enabled()) { + *alloc_gfp |= __GFP_HARDWALL; + /* + * When we are in the interrupt context, it is irrelevant + * to the current task context. It means that any node ok. + */ + if (in_task() && !ac->nodemask) + ac->nodemask = &cpuset_current_mems_allowed; + else + *alloc_flags |= ALLOC_CPUSET; + } + + might_alloc(gfp_mask); + + if (should_fail_alloc_page(gfp_mask, order)) + return false; + + *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); + + /* Dirty zone balancing only done in the fast path */ + ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); + + /* + * The preferred zone is used for statistics but crucially it is + * also used as the starting point for the zonelist iterator. It + * may get reset for allocations that ignore memory policies. 
+ */ + ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->highest_zoneidx, ac->nodemask); + + return true; +} + +/* + * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array + * @gfp: GFP flags for the allocation + * @preferred_nid: The preferred NUMA node ID to allocate from + * @nodemask: Set of nodes to allocate from, may be NULL + * @nr_pages: The number of pages desired on the list or array + * @page_list: Optional list to store the allocated pages + * @page_array: Optional array to store the pages + * + * This is a batched version of the page allocator that attempts to + * allocate nr_pages quickly. Pages are added to page_list if page_list + * is not NULL, otherwise it is assumed that the page_array is valid. + * + * For lists, nr_pages is the number of pages that should be allocated. + * + * For arrays, only NULL elements are populated with pages and nr_pages + * is the maximum number of pages that will be stored in the array. + * + * Returns the number of pages on the list or array. + */ +unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *page_list, + struct page **page_array) +{ + struct page *page; + unsigned long __maybe_unused UP_flags; + struct zone *zone; + struct zoneref *z; + struct per_cpu_pages *pcp; + struct list_head *pcp_list; + struct alloc_context ac; + gfp_t alloc_gfp; + unsigned int alloc_flags = ALLOC_WMARK_LOW; + int nr_populated = 0, nr_account = 0; + + /* + * Skip populated array elements to determine if any pages need + * to be allocated before disabling IRQs. + */ + while (page_array && nr_populated < nr_pages && page_array[nr_populated]) + nr_populated++; + + /* No pages requested? */ + if (unlikely(nr_pages <= 0)) + goto out; + + /* Already populated array? */ + if (unlikely(page_array && nr_pages - nr_populated == 0)) + goto out; + + /* Bulk allocator does not support memcg accounting. 
*/ + if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT)) + goto failed; + + /* Use the single page allocator for one page. */ + if (nr_pages - nr_populated == 1) + goto failed; + +#ifdef CONFIG_PAGE_OWNER + /* + * PAGE_OWNER may recurse into the allocator to allocate space to + * save the stack with pagesets.lock held. Releasing/reacquiring + * removes much of the performance benefit of bulk allocation so + * force the caller to allocate one page at a time as it'll have + * similar performance to added complexity to the bulk allocator. + */ + if (static_branch_unlikely(&page_owner_inited)) + goto failed; +#endif + + /* May set ALLOC_NOFRAGMENT, fragmentation will return 1 page. */ + gfp &= gfp_allowed_mask; + alloc_gfp = gfp; + if (!prepare_alloc_pages(gfp, 0, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) + goto out; + gfp = alloc_gfp; + + /* Find an allowed local zone that meets the low watermark. */ + for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) { + unsigned long mark; + + if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && + !__cpuset_zone_allowed(zone, gfp)) { + continue; + } + + if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone && + zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) { + goto failed; + } + + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages; + if (zone_watermark_fast(zone, 0, mark, + zonelist_zone_idx(ac.preferred_zoneref), + alloc_flags, gfp)) { + break; + } + } + + /* + * If there are no allowed local zones that meets the watermarks then + * try to allocate a single page and reclaim if necessary. + */ + if (unlikely(!zone)) + goto failed; + + /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. 
*/ + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (!pcp) + goto failed_irq; + + /* Attempt the batch allocation */ + pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; + while (nr_populated < nr_pages) { + + /* Skip existing pages */ + if (page_array && page_array[nr_populated]) { + nr_populated++; + continue; + } + + page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, + pcp, pcp_list); + if (unlikely(!page)) { + /* Try and allocate at least one page */ + if (!nr_account) { + pcp_spin_unlock(pcp); + goto failed_irq; + } + break; + } + nr_account++; + + prep_new_page(page, 0, gfp, 0); + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; + } + + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); + + __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); + zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); + +out: + return nr_populated; + +failed_irq: + pcp_trylock_finish(UP_flags); + +failed: + page = __alloc_pages(gfp, 0, preferred_nid, nodemask); + if (page) { + if (page_list) + list_add(&page->lru, page_list); + else + page_array[nr_populated] = page; + nr_populated++; + } + + goto out; +} +EXPORT_SYMBOL_GPL(__alloc_pages_bulk); + +/* + * This is the 'heart' of the zoned buddy allocator. + * + * A first attempt is made with get_page_from_freelist(); if that yields no + * page the request is handed to __alloc_pages_slowpath(). Returns the + * allocated page, or NULL when @order is >= MAX_ORDER, when + * prepare_alloc_pages() fails, when no page could be allocated, or when + * the memcg kmem charge for the page is refused. + */ +struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask) +{ + struct page *page; + unsigned int alloc_flags = ALLOC_WMARK_LOW; + gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */ + struct alloc_context ac = { }; + + /* + * There are several places where we assume that the order value is sane + * so bail out early if the request is out of bounds. + */ + if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp)) + return NULL; + + gfp &= gfp_allowed_mask; + /* + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. 
GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures + * movable zones are not used during allocation. + */ + gfp = current_gfp_context(gfp); + alloc_gfp = gfp; + if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, + &alloc_gfp, &alloc_flags)) + return NULL; + + /* + * Forbid the first pass from falling back to types that fragment + * memory until all local zones are considered. + */ + alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp); + + /* First allocation attempt */ + page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); + if (likely(page)) + goto out; + + alloc_gfp = gfp; + ac.spread_dirty_pages = false; + + /* + * Restore the original nodemask if it was potentially replaced with + * &cpuset_current_mems_allowed to optimize the fast-path attempt. + */ + ac.nodemask = nodemask; + + page = __alloc_pages_slowpath(alloc_gfp, order, &ac); + +out: + if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page && + unlikely(__memcg_kmem_charge_page(page, gfp, order) != 0)) { + __free_pages(page, order); + page = NULL; + } + + trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); + kmsan_alloc_page(page, order, alloc_gfp); + + return page; +} +EXPORT_SYMBOL(__alloc_pages); +/* + * Folio flavour of __alloc_pages(): __GFP_COMP is always added to @gfp and + * prep_transhuge_page() is called on successful allocations of order > 1. + * Returns NULL when __alloc_pages() does. + */ +struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask) +{ + struct page *page = __alloc_pages(gfp | __GFP_COMP, order, + preferred_nid, nodemask); + + if (page && order > 1) + prep_transhuge_page(page); + return (struct folio *)page; +} +EXPORT_SYMBOL(__folio_alloc); + +/* + * Common helper functions. Never use with __GFP_HIGHMEM because the returned + * address cannot represent highmem pages. Use alloc_pages and then kmap if + * you need to access high mem. 
+ */ +unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + + page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); /* __GFP_HIGHMEM is masked out here */ + if (!page) + return 0; /* allocation failed */ + return (unsigned long) page_address(page); +} +EXPORT_SYMBOL(__get_free_pages); +/* Allocate one order-0 page with __GFP_ZERO added to @gfp_mask; returns 0 on failure. */ +unsigned long get_zeroed_page(gfp_t gfp_mask) +{ + return __get_free_pages(gfp_mask | __GFP_ZERO, 0); +} +EXPORT_SYMBOL(get_zeroed_page); + +/** + * __free_pages - Free pages allocated with alloc_pages(). + * @page: The page pointer returned from alloc_pages(). + * @order: The order of the allocation. + * + * This function can free multi-page allocations that are not compound + * pages. It does not check that the @order passed in matches that of + * the allocation, so it is easy to leak memory. Freeing more memory + * than was allocated will probably emit a warning. + * + * If the last reference to this page is speculative, it will be released + * by put_page() which only frees the first page of a non-compound + * allocation. To prevent the remaining pages from being leaked, we free + * the subsequent pages here. If you want to use the page's reference + * count to decide when to free the allocation, you should allocate a + * compound page, and use put_page() instead of __free_pages(). + * + * Context: May be called in interrupt context or while holding a normal + * spinlock, but not in NMI context or while holding a raw spinlock. 
+ */ +void __free_pages(struct page *page, unsigned int order) +{ + /* Sample PageHead() before put_page_testzero() may drop our reference */ + int head = PageHead(page); + + if (put_page_testzero(page)) + free_the_page(page, order); + else if (!head) /* not a compound head and not the last reference: free the trailing pages ourselves */ + while (order-- > 0) + free_the_page(page + (1 << order), order); +} +EXPORT_SYMBOL(__free_pages); +/* Free pages identified by kernel virtual address @addr; an @addr of 0 is ignored. */ +void free_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + VM_BUG_ON(!virt_addr_valid((void *)addr)); + __free_pages(virt_to_page((void *)addr), order); + } +} + +EXPORT_SYMBOL(free_pages); + +/* + * Page Fragment: + * An arbitrary-length arbitrary-offset area of memory which resides + * within a 0 or higher order page. Multiple fragments within that page + * are individually refcounted, in the page's reference counter. + * + * The page_frag functions below provide a simple allocation framework for + * page fragments. This is used by the network stack and network device + * drivers to provide a backing region of memory for use as either an + * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. + * + * __page_frag_cache_refill() gives @nc a new backing page: when PAGE_SIZE is + * smaller than PAGE_FRAG_CACHE_MAX_SIZE it first tries an allocation of + * PAGE_FRAG_CACHE_MAX_ORDER, then falls back to a single page with the + * caller's original gfp mask. nc->va is set to the page's address, or to + * NULL when both attempts fail. + */ +static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, + gfp_t gfp_mask) +{ + struct page *page = NULL; + gfp_t gfp = gfp_mask; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC; + page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, + PAGE_FRAG_CACHE_MAX_ORDER); + nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; +#endif + if (unlikely(!page)) + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + + nc->va = page ? 
page_address(page) : NULL; + + return page; +} +/* Drop @count references from @page and free it once the count reaches zero. */ +void __page_frag_cache_drain(struct page *page, unsigned int count) +{ + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); + + if (page_ref_sub_and_test(page, count)) + free_the_page(page, compound_order(page)); +} +EXPORT_SYMBOL(__page_frag_cache_drain); +/* + * Carve a fragment of @fragsz bytes out of the cache @nc, refilling the + * cache with a new page when it is empty or exhausted. The returned offset + * is masked with @align_mask. Returns NULL when no page can be allocated + * or when @fragsz does not fit in the cache page. + */ +void *page_frag_alloc_align(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask, + unsigned int align_mask) +{ + unsigned int size = PAGE_SIZE; + struct page *page; + int offset; + + if (unlikely(!nc->va)) { +refill: + page = __page_frag_cache_refill(nc, gfp_mask); + if (!page) + return NULL; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif + /* Even if we own the page, we do not use atomic_set(). + * This would break get_page_unless_zero() users. + */ + page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); + + /* reset page count bias; the offset starts at the end of the page and fragments are carved downwards */ + nc->pfmemalloc = page_is_pfmemalloc(page); + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; + nc->offset = size; + } + + offset = nc->offset - fragsz; + if (unlikely(offset < 0)) { + page = virt_to_page(nc->va); + + if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) + goto refill; + + if (unlikely(nc->pfmemalloc)) { + free_the_page(page, compound_order(page)); + goto refill; + } + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif + /* OK, page count is 0, we can safely set it */ + set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); + + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; + offset = size - fragsz; + if (unlikely(offset < 0)) { + /* + * The caller is trying to allocate a fragment + * with fragsz > PAGE_SIZE but the cache isn't big + * enough to satisfy the request, this may + * happen in low memory conditions. 
+ * We don't release the cache page because + * it could make memory pressure worse + * so we simply return NULL here. + */ + return NULL; + } + } + + nc->pagecnt_bias--; + offset &= align_mask; + nc->offset = offset; + + return nc->va + offset; +} +EXPORT_SYMBOL(page_frag_alloc_align); + +/* + * Frees a page fragment allocated out of either a compound or order 0 page. + */ +void page_frag_free(void *addr) +{ + struct page *page = virt_to_head_page(addr); + + if (unlikely(put_page_testzero(page))) + free_the_page(page, compound_order(page)); +} +EXPORT_SYMBOL(page_frag_free); +/* + * Trim the order-@order allocation at @addr down to the + * DIV_ROUND_UP(@size, PAGE_SIZE) pages that are actually needed: the kept + * pages after the first are made refcounted and the surplus pages at the + * end are freed one by one. An @addr of 0 is passed through unchanged. + */ +static void *make_alloc_exact(unsigned long addr, unsigned int order, + size_t size) +{ + if (addr) { + unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE); + struct page *page = virt_to_page((void *)addr); + struct page *last = page + nr; + + split_page_owner(page, 1 << order); + split_page_memcg(page, 1 << order); + while (page < --last) + set_page_refcounted(last); + + last = page + (1UL << order); + for (page += nr; page < last; page++) + __free_pages_ok(page, 0, FPI_TO_TAIL); + } + return (void *)addr; +} + +/** + * alloc_pages_exact - allocate an exact number of physically-contiguous pages. + * @size: the number of bytes to allocate + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP or + * __GFP_HIGHMEM (both are stripped after a one-time warning) + * + * This function is similar to alloc_pages(), except that it allocates the + * minimum number of pages to satisfy the request. alloc_pages() can only + * allocate memory in power-of-two pages. + * + * This function is also limited by MAX_ORDER. + * + * Memory allocated by this function must be released by free_pages_exact(). + * + * Return: pointer to the allocated area or %NULL in case of error. 
+ */ +void *alloc_pages_exact(size_t size, gfp_t gfp_mask) +{ + unsigned int order = get_order(size); + unsigned long addr; + + if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) + gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); + + addr = __get_free_pages(gfp_mask, order); + return make_alloc_exact(addr, order, size); +} +EXPORT_SYMBOL(alloc_pages_exact); + +/** + * alloc_pages_exact_nid - allocate an exact number of physically-contiguous + * pages on a node. + * @nid: the preferred node ID where memory should be allocated + * @size: the number of bytes to allocate + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP or + * __GFP_HIGHMEM (both are stripped after a one-time warning) + * + * Like alloc_pages_exact(), but try to allocate on node @nid first before + * falling back. + * + * Return: pointer to the allocated area or %NULL in case of error. + */ +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +{ + unsigned int order = get_order(size); + struct page *p; + + if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) + gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); + + p = alloc_pages_node(nid, gfp_mask, order); + if (!p) + return NULL; + return make_alloc_exact((unsigned long)page_address(p), order, size); +} + +/** + * free_pages_exact - release memory allocated via alloc_pages_exact() + * @virt: the value returned by alloc_pages_exact. + * @size: size of allocation, same value as passed to alloc_pages_exact(). + * + * Release the memory allocated by a previous call to alloc_pages_exact. 
+ */ +void free_pages_exact(void *virt, size_t size) +{ + unsigned long addr = (unsigned long)virt; + unsigned long end = addr + PAGE_ALIGN(size); + + while (addr < end) { /* the range is released one page at a time */ + free_page(addr); + addr += PAGE_SIZE; + } +} +EXPORT_SYMBOL(free_pages_exact); + +/** + * nr_free_zone_pages - count number of pages beyond high watermark + * @offset: The zone index of the highest zone + * + * nr_free_zone_pages() counts the number of pages which are beyond the + * high watermark within all zones at or below a given zone index. For each + * zone, the number of pages is calculated as: + * + * nr_free_zone_pages = managed_pages - high_pages + * + * A zone whose managed page count does not exceed its high watermark + * contributes nothing. Note that the calculation uses the managed page + * count of each zone, not its current number of free pages. + * + * Return: number of pages beyond high watermark. + */ +static unsigned long nr_free_zone_pages(int offset) +{ + struct zoneref *z; + struct zone *zone; + + /* Just pick one node (the local one), since fallback list is circular */ + unsigned long sum = 0; + + struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); + + for_each_zone_zonelist(zone, z, zonelist, offset) { + unsigned long size = zone_managed_pages(zone); + unsigned long high = high_wmark_pages(zone); + if (size > high) + sum += size - high; + } + + return sum; +} + +/** + * nr_free_buffer_pages - count number of pages beyond high watermark + * + * nr_free_buffer_pages() counts the number of pages which are beyond the high + * watermark within ZONE_DMA and ZONE_NORMAL. + * + * Return: number of pages beyond high watermark within ZONE_DMA and + * ZONE_NORMAL. 
+ */ +unsigned long nr_free_buffer_pages(void) +{ + return nr_free_zone_pages(gfp_zone(GFP_USER)); +} +EXPORT_SYMBOL_GPL(nr_free_buffer_pages); +/* Print a "Node <nid> " prefix for @zone when CONFIG_NUMA is enabled. */ +static inline void show_node(struct zone *zone) +{ + if (IS_ENABLED(CONFIG_NUMA)) + printk("Node %d ", zone_to_nid(zone)); +} +/* + * Estimate how much memory is available for new userspace allocations: + * free pages above the total reserve, plus the share of the file LRU pages + * and of reclaimable kernel memory that is assumed to be freeable. The + * result is clamped so that it is never negative. + */ +long si_mem_available(void) +{ + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; + unsigned long pages[NR_LRU_LISTS]; + unsigned long reclaimable; + struct zone *zone; + int lru; + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_node_page_state(NR_LRU_BASE + lru); + + for_each_zone(zone) + wmark_low += low_wmark_pages(zone); + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping or OOM. + */ + available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping or thrashing. Assume at least half of the page + * cache, or the low watermark worth of cache, needs to stay + * (whichever of the two is smaller). + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable slab and other kernel memory consists of + * items that are in use, and cannot be freed. Cap this estimate at the + * low watermark. 
+ */ + reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); + available += reclaimable - min(reclaimable / 2, wmark_low); + + if (available < 0) + available = 0; + return available; +} +EXPORT_SYMBOL_GPL(si_mem_available); +/* Fill @val with system-wide memory totals; mem_unit is set to PAGE_SIZE. */ +void si_meminfo(struct sysinfo *val) +{ + val->totalram = totalram_pages(); + val->sharedram = global_node_page_state(NR_SHMEM); + val->freeram = global_zone_page_state(NR_FREE_PAGES); + val->bufferram = nr_blockdev_pages(); + val->totalhigh = totalhigh_pages(); + val->freehigh = nr_free_highpages(); + val->mem_unit = PAGE_SIZE; +} + +EXPORT_SYMBOL(si_meminfo); +/* si_meminfo_node(): fill @val with the memory totals of node @nid only. */ +#ifdef CONFIG_NUMA +void si_meminfo_node(struct sysinfo *val, int nid) +{ + int zone_type; /* needs to be signed */ + unsigned long managed_pages = 0; + unsigned long managed_highpages = 0; + unsigned long free_highpages = 0; + pg_data_t *pgdat = NODE_DATA(nid); + + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); + val->totalram = managed_pages; + val->sharedram = node_page_state(pgdat, NR_SHMEM); + val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); +#ifdef CONFIG_HIGHMEM + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + struct zone *zone = &pgdat->node_zones[zone_type]; + + if (is_highmem(zone)) { + managed_highpages += zone_managed_pages(zone); + free_highpages += zone_page_state(zone, NR_FREE_PAGES); + } + } + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; +#else /* !CONFIG_HIGHMEM: both counters are still zero here */ + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; +#endif + val->mem_unit = PAGE_SIZE; +} +#endif + +/* + * Determine whether the node should be displayed or not, depending on whether + * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 
+ */ +static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) +{ + if (!(flags & SHOW_MEM_FILTER_NODES)) + return false; + + /* + * no node mask - aka implicit memory numa policy. Do not bother with + * the synchronization - read_mems_allowed_begin - because we do not + * have to be precise here. + */ + if (!nodemask) + nodemask = &cpuset_current_mems_allowed; + + return !node_isset(nid, *nodemask); /* true means: skip this node */ +} +/* Shift a value left by (PAGE_SHIFT - 10); used below to print page counts as kB. */ +#define K(x) ((x) << (PAGE_SHIFT-10)) +/* Print, in parentheses, one letter for every migrate type whose bit is set in @type. */ +static void show_migration_types(unsigned char type) +{ + static const char types[MIGRATE_TYPES] = { + [MIGRATE_UNMOVABLE] = 'U', + [MIGRATE_MOVABLE] = 'M', + [MIGRATE_RECLAIMABLE] = 'E', + [MIGRATE_HIGHATOMIC] = 'H', +#ifdef CONFIG_CMA + [MIGRATE_CMA] = 'C', +#endif +#ifdef CONFIG_MEMORY_ISOLATION + [MIGRATE_ISOLATE] = 'I', +#endif + }; + char tmp[MIGRATE_TYPES + 1]; + char *p = tmp; + int i; + + for (i = 0; i < MIGRATE_TYPES; i++) { + if (type & (1 << i)) + *p++ = types[i]; + } + + *p = '\0'; + printk(KERN_CONT "(%s) ", tmp); +} +/* Return true if any zone of @pgdat with an index <= @max_zone_idx has managed pages. */ +static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) +{ + int zone_idx; + for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++) + if (zone_managed_pages(pgdat->node_zones + zone_idx)) + return true; + return false; +} + +/* + * Show free area list (used inside shift_scroll-lock stuff) + * We also calculate the percentage fragmentation. We do this by counting the + * memory on each free list with the exception of the first item on the list. + * NOTE(review): no fragmentation percentage is computed in the function + * below; it prints per-order free block counts and a per-zone total. The + * two sentences above look stale - confirm before relying on them. + * + * Bits in @filter: + * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's + * cpuset. 
+ */ +void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) +{ + unsigned long free_pcp = 0; + int cpu, nid; + struct zone *zone; + pg_data_t *pgdat; + /* Total the pages held on the per-cpu lists of every zone that will be shown */ + for_each_populated_zone(zone) { + if (zone_idx(zone) > max_zone_idx) + continue; + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) + continue; + + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; + } + /* System-wide counters, printed as raw values (not converted with K()) */ + printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" + " active_file:%lu inactive_file:%lu isolated_file:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu\n" + " slab_reclaimable:%lu slab_unreclaimable:%lu\n" + " mapped:%lu shmem:%lu pagetables:%lu\n" + " sec_pagetables:%lu bounce:%lu\n" + " kernel_misc_reclaimable:%lu\n" + " free:%lu free_pcp:%lu free_cma:%lu\n", + global_node_page_state(NR_ACTIVE_ANON), + global_node_page_state(NR_INACTIVE_ANON), + global_node_page_state(NR_ISOLATED_ANON), + global_node_page_state(NR_ACTIVE_FILE), + global_node_page_state(NR_INACTIVE_FILE), + global_node_page_state(NR_ISOLATED_FILE), + global_node_page_state(NR_UNEVICTABLE), + global_node_page_state(NR_FILE_DIRTY), + global_node_page_state(NR_WRITEBACK), + global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B), + global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), + global_node_page_state(NR_FILE_MAPPED), + global_node_page_state(NR_SHMEM), + global_node_page_state(NR_PAGETABLE), + global_node_page_state(NR_SECONDARY_PAGETABLE), + global_zone_page_state(NR_BOUNCE), + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE), + global_zone_page_state(NR_FREE_PAGES), + free_pcp, + global_zone_page_state(NR_FREE_CMA_PAGES)); + /* Per-node counters; nodes filtered out or without managed zones are skipped */ + for_each_online_pgdat(pgdat) { + if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) + continue; + if (!node_has_managed_zones(pgdat, max_zone_idx)) + continue; + + printk("Node %d" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" + " unevictable:%lukB" + " 
isolated(anon):%lukB" + " isolated(file):%lukB" + " mapped:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " shmem:%lukB" +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + " shmem_thp: %lukB" + " shmem_pmdmapped: %lukB" + " anon_thp: %lukB" +#endif + " writeback_tmp:%lukB" + " kernel_stack:%lukB" +#ifdef CONFIG_SHADOW_CALL_STACK + " shadow_call_stack:%lukB" +#endif + " pagetables:%lukB" + " sec_pagetables:%lukB" + " all_unreclaimable? %s" + "\n", + pgdat->node_id, + K(node_page_state(pgdat, NR_ACTIVE_ANON)), + K(node_page_state(pgdat, NR_INACTIVE_ANON)), + K(node_page_state(pgdat, NR_ACTIVE_FILE)), + K(node_page_state(pgdat, NR_INACTIVE_FILE)), + K(node_page_state(pgdat, NR_UNEVICTABLE)), + K(node_page_state(pgdat, NR_ISOLATED_ANON)), + K(node_page_state(pgdat, NR_ISOLATED_FILE)), + K(node_page_state(pgdat, NR_FILE_MAPPED)), + K(node_page_state(pgdat, NR_FILE_DIRTY)), + K(node_page_state(pgdat, NR_WRITEBACK)), + K(node_page_state(pgdat, NR_SHMEM)), +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + K(node_page_state(pgdat, NR_SHMEM_THPS)), + K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), + K(node_page_state(pgdat, NR_ANON_THPS)), +#endif + K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), + node_page_state(pgdat, NR_KERNEL_STACK_KB), +#ifdef CONFIG_SHADOW_CALL_STACK + node_page_state(pgdat, NR_KERNEL_SCS_KB), +#endif + K(node_page_state(pgdat, NR_PAGETABLE)), + K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? 
+ "yes" : "no"); + } + /* Per-zone counters and watermarks, followed by the zone's lowmem_reserve[] */ + for_each_populated_zone(zone) { + int i; + + if (zone_idx(zone) > max_zone_idx) + continue; + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) + continue; + + free_pcp = 0; + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->per_cpu_pageset, cpu)->count; + + show_node(zone); + printk(KERN_CONT + "%s" + " free:%lukB" + " boost:%lukB" + " min:%lukB" + " low:%lukB" + " high:%lukB" + " reserved_highatomic:%luKB" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" + " unevictable:%lukB" + " writepending:%lukB" + " present:%lukB" + " managed:%lukB" + " mlocked:%lukB" + " bounce:%lukB" + " free_pcp:%lukB" + " local_pcp:%ukB" + " free_cma:%lukB" + "\n", + zone->name, + K(zone_page_state(zone, NR_FREE_PAGES)), + K(zone->watermark_boost), + K(min_wmark_pages(zone)), + K(low_wmark_pages(zone)), + K(high_wmark_pages(zone)), + K(zone->nr_reserved_highatomic), + K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), + K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), + K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), + K(zone->present_pages), + K(zone_managed_pages(zone)), + K(zone_page_state(zone, NR_MLOCK)), + K(zone_page_state(zone, NR_BOUNCE)), + K(free_pcp), + K(this_cpu_read(zone->per_cpu_pageset->count)), + K(zone_page_state(zone, NR_FREE_CMA_PAGES))); + printk("lowmem_reserve[]:"); + for (i = 0; i < MAX_NR_ZONES; i++) + printk(KERN_CONT " %ld", zone->lowmem_reserve[i]); + printk(KERN_CONT "\n"); + } + /* Per-zone number of free blocks of each order, with the migrate types present */ + for_each_populated_zone(zone) { + unsigned int order; + unsigned long nr[MAX_ORDER], flags, total = 0; + unsigned char types[MAX_ORDER]; + + if (zone_idx(zone) > max_zone_idx) + continue; + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) + continue; + show_node(zone); + printk(KERN_CONT "%s: ", zone->name); + + 
spin_lock_irqsave(&zone->lock, flags); /* copy the counts under zone->lock, print after it is dropped */ + for (order = 0; order < MAX_ORDER; order++) { + struct free_area *area = &zone->free_area[order]; + int type; + + nr[order] = area->nr_free; + total += nr[order] << order; + + types[order] = 0; + for (type = 0; type < MIGRATE_TYPES; type++) { + if (!free_area_empty(area, type)) + types[order] |= 1 << type; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + printk(KERN_CONT "%lu*%lukB ", + nr[order], K(1UL) << order); + if (nr[order]) + show_migration_types(types[order]); + } + printk(KERN_CONT "= %lukB\n", K(total)); + } + + for_each_online_node(nid) { + if (show_mem_node_skip(filter, nid, nodemask)) + continue; + hugetlb_show_meminfo_node(nid); + } + + printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); + + show_swap_cache_info(); +} +/* Point @zoneref at @zone and store the zone's index next to it. */ +static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) +{ + zoneref->zone = zone; + zoneref->zone_idx = zone_idx(zone); +} + +/* + * Builds allocation fallback zone lists. + * + * Add all populated zones of a node to the zonelist. + * + * The zones are visited from the highest zone index down to zone 0. + * Returns the number of entries written to @zonerefs. + */ +static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) +{ + struct zone *zone; + enum zone_type zone_type = MAX_NR_ZONES; + int nr_zones = 0; + + do { + zone_type--; + zone = pgdat->node_zones + zone_type; + if (populated_zone(zone)) { + zoneref_set_zone(zone, &zonerefs[nr_zones++]); + check_highest_zone(zone_type); + } + } while (zone_type); + + return nr_zones; +} + +#ifdef CONFIG_NUMA + +static int __parse_numa_zonelist_order(char *s) +{ + /* + * We used to support different zonelists modes but they turned + * out to be just not useful. 
Let's keep the warning in place + * if somebody still uses the cmd line parameter so that we do + * not fail silently. + */ + if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { + pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); + return -EINVAL; + } + return 0; +} + +char numa_zonelist_order[] = "Node"; + +/* + * sysctl handler for numa_zonelist_order + * + * A write is only validated by __parse_numa_zonelist_order() and its result + * returned; reads are served by proc_dostring(). + */ +int numa_zonelist_order_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + if (write) + return __parse_numa_zonelist_order(buffer); + return proc_dostring(table, write, buffer, length, ppos); +} + +/* Per-node load value, added by find_next_best_node() to its score for a node */ +static int node_load[MAX_NUMNODES]; + +/** + * find_next_best_node - find the next node that should appear in a given node's fallback list + * @node: node whose fallback list we're appending + * @used_node_mask: nodemask_t of already used nodes + * + * We use a number of factors to determine which is the next node that should + * appear on a given node's fallback list. The node should not have appeared + * already in @node's fallback list, and it should be the next closest node + * according to the distance array (which contains arbitrary distance values + * from each node to each node in the system), and should also prefer nodes + * with no CPUs, since presumably they'll have very little allocation pressure + * on them otherwise. + * + * Return: node id of the found node or %NUMA_NO_NODE if no node is found. 
+ */ +int find_next_best_node(int node, nodemask_t *used_node_mask) +{ + int n, val; + int min_val = INT_MAX; + int best_node = NUMA_NO_NODE; + + /* Use the local node if we haven't already */ + if (!node_isset(node, *used_node_mask)) { + node_set(node, *used_node_mask); + return node; + } + + for_each_node_state(n, N_MEMORY) { + + /* Don't want a node to appear more than once */ + if (node_isset(n, *used_node_mask)) + continue; + + /* Use the distance array to find the distance */ + val = node_distance(node, n); + + /* Penalize nodes under us ("prefer the next node") */ + val += (n < node); + + /* Give preference to headless and unused nodes */ + if (!cpumask_empty(cpumask_of_node(n))) + val += PENALTY_FOR_NODE_WITH_CPUS; + + /* + * Slight preference for less loaded node: the score so far is + * scaled by MAX_NUMNODES before node_load[] is added, presumably + * so that the load only acts as a tie-breaker. + */ + val *= MAX_NUMNODES; + val += node_load[n]; + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + if (best_node >= 0) + node_set(best_node, *used_node_mask); /* mark the pick as used for later calls */ + + return best_node; +} + + +/* + * Build zonelists ordered by node and zones within node. + * This results in maximum locality--normal zone overflows into local + * DMA zone, if any--but risks exhausting DMA zone. 
+ */ +static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, + unsigned nr_nodes) +{ + struct zoneref *zonerefs; + int i; + + zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; + + for (i = 0; i < nr_nodes; i++) { + int nr_zones; + + pg_data_t *node = NODE_DATA(node_order[i]); + + nr_zones = build_zonerefs_node(node, zonerefs); + zonerefs += nr_zones; + } + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; +} + +/* + * Build gfp_thisnode zonelists + * + * Fills the ZONELIST_NOFALLBACK list of @pgdat with that node's own zones + * only. + */ +static void build_thisnode_zonelists(pg_data_t *pgdat) +{ + struct zoneref *zonerefs; + int nr_zones; + + zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; + nr_zones = build_zonerefs_node(pgdat, zonerefs); + zonerefs += nr_zones; + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; +} + +/* + * Build zonelists ordered by zone and nodes within zones. + * This results in conserving DMA zone[s] until all Normal memory is + * exhausted, but results in overflowing to remote node while memory + * may still exist in local DMA zone. + * + * NOTE(review): the function that follows this comment orders the lists by + * node (it calls build_zonelists_in_node_order()) and then builds the + * this-node list, so the description above does not match it. + */ + +static void build_zonelists(pg_data_t *pgdat) +{ + static int node_order[MAX_NUMNODES]; + int node, nr_nodes = 0; + nodemask_t used_mask = NODE_MASK_NONE; + int local_node, prev_node; + + /* NUMA-aware ordering of nodes */ + local_node = pgdat->node_id; + prev_node = local_node; + + memset(node_order, 0, sizeof(node_order)); + while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { + /* + * We don't want to pressure a particular node. + * So add a penalty to the first node in each + * distance group to make it round-robin. 
+ */ + if (node_distance(local_node, node) != + node_distance(local_node, prev_node)) + node_load[node] += 1; + + node_order[nr_nodes++] = node; + prev_node = node; + } + + build_zonelists_in_node_order(pgdat, node_order, nr_nodes); + build_thisnode_zonelists(pgdat); + pr_info("Fallback order for Node %d: ", local_node); + for (node = 0; node < nr_nodes; node++) + pr_cont("%d ", node_order[node]); + pr_cont("\n"); +} + +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +/* + * Return node id of node used for "local" allocations. + * I.e., first node id of first zone in arg node's generic zonelist. + * Used for initializing percpu 'numa_mem', which is used primarily + * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. + */ +int local_memory_node(int node) +{ + struct zoneref *z; + + z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), + gfp_zone(GFP_KERNEL), + NULL); + return zone_to_nid(z->zone); +} +#endif + +static void setup_min_unmapped_ratio(void); +static void setup_min_slab_ratio(void); +#else /* CONFIG_NUMA */ +/* Non-NUMA build: the fallback list holds the local zones first, then those of the other online nodes. */ +static void build_zonelists(pg_data_t *pgdat) +{ + int node, local_node; + struct zoneref *zonerefs; + int nr_zones; + + local_node = pgdat->node_id; + + zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; + nr_zones = build_zonerefs_node(pgdat, zonerefs); + zonerefs += nr_zones; + + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. 
+ * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (wrapping around to node 0 after the last node) + */ + for (node = local_node + 1; node < MAX_NUMNODES; node++) { + if (!node_online(node)) + continue; + nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); + zonerefs += nr_zones; + } + for (node = 0; node < local_node; node++) { /* then the nodes numbered below the local one */ + if (!node_online(node)) + continue; + nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); + zonerefs += nr_zones; + } + + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; +} + +#endif /* CONFIG_NUMA */ + +/* + * Boot pageset table. One per cpu which is going to be used for all + * zones and all nodes. The parameters will be set in such a way + * that an item put on a list will immediately be handed over to + * the buddy list. This is safe since pageset manipulation is done + * with interrupts disabled. + * + * The boot_pagesets must be kept even after bootup is complete for + * unused processors and/or zones. They do play a role for bootstrapping + * hotplugged processors. + * + * zoneinfo_show() and maybe other functions do + * not check if the processor is online before following the pageset pointer. + * Other parts of the kernel may not check if the zone is available. 
+ */ +static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats); +/* These effectively disable the pcplists in the boot pageset completely */ +#define BOOT_PAGESET_HIGH 0 +#define BOOT_PAGESET_BATCH 1 +static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); +static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); +/* + * (Re)build zonelists while holding zonelist_update_seq for writing, with + * local interrupts and synchronous printk disabled. @data is NULL or a + * pg_data_t: if it names a node that is not online, only that node's lists + * are built; otherwise the lists of every node are rebuilt. + */ +static void __build_all_zonelists(void *data) +{ + int nid; + int __maybe_unused cpu; + pg_data_t *self = data; + unsigned long flags; + + /* + * Explicitly disable this CPU's interrupts before taking seqlock + * to prevent any IRQ handler from calling into the page allocator + * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. + */ + local_irq_save(flags); + /* + * Explicitly disable this CPU's synchronous printk() before taking + * seqlock to prevent any printk() from trying to hold port->lock, for + * tty_insert_flip_string_and_push_buffer() on other CPU might be + * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. + */ + printk_deferred_enter(); + write_seqlock(&zonelist_update_seq); + +#ifdef CONFIG_NUMA + memset(node_load, 0, sizeof(node_load)); +#endif + + /* + * This node is hotadded and no memory is yet present. So just + * building zonelists is fine - no need to touch other nodes. + */ + if (self && !node_online(self->node_id)) { + build_zonelists(self); + } else { + /* + * All possible nodes have pgdat preallocated + * in free_area_init + */ + for_each_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + + build_zonelists(pgdat); + } + +#ifdef CONFIG_HAVE_MEMORYLESS_NODES + /* + * We now know the "local memory node" for each node-- + * i.e., the node of the first zone in the generic zonelist. + * Set up numa_mem percpu variable for on-line cpus. During + * boot, only the boot cpu should be on-line; we'll init the + * secondary cpus' numa_mem as they come on-line. 
During + * node/memory hotplug, we'll fixup all on-line cpus. + */ + for_each_online_cpu(cpu) + set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); +#endif + } + + write_sequnlock(&zonelist_update_seq); /* released in the reverse order of acquisition */ + printk_deferred_exit(); + local_irq_restore(flags); +} +/* Boot-time path: build every zonelist, then initialise the per-cpu boot pagesets. */ +static noinline void __init +build_all_zonelists_init(void) +{ + int cpu; + + __build_all_zonelists(NULL); + + /* + * Initialize the boot_pagesets that are going to be used + * for bootstrapping processors. The real pagesets for + * each zone will be allocated later when the per cpu + * allocator is available. + * + * boot_pagesets are used also for bootstrapping offline + * cpus if the system is already booted because the pagesets + * are needed to initialize allocators on a specific cpu too. + * F.e. the percpu allocator needs the page allocator which + * needs the percpu allocator in order to allocate its pagesets + * (a chicken-egg dilemma). + */ + for_each_possible_cpu(cpu) + per_cpu_pages_init(&per_cpu(boot_pageset, cpu), &per_cpu(boot_zonestats, cpu)); + + mminit_verify_zonelist(); + cpuset_init_current_mems_allowed(); +} + +/* + * Build the zonelists and then decide whether grouping pages by mobility + * is enabled. While system_state == SYSTEM_BOOTING the __init helper + * build_all_zonelists_init() is used; in any other state + * __build_all_zonelists(@pgdat) is called directly. + * + * __ref due to call of __init annotated helper build_all_zonelists_init + * [protected by SYSTEM_BOOTING]. + */ +void __ref build_all_zonelists(pg_data_t *pgdat) +{ + unsigned long vm_total_pages; + + if (system_state == SYSTEM_BOOTING) { + build_all_zonelists_init(); + } else { + __build_all_zonelists(pgdat); + /* cpuset refresh routine should be here */ + } + /* Get the number of free pages beyond high watermark in all zones. */ + vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); + /* + * Disable grouping by mobility if the number of pages in the + * system is too low to allow the mechanism to work. It would be + * more accurate, but expensive to check per-zone. 
This check is + * made on memory-hotadd so a system can start with mobility + * disabled and enable it later + */ + if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) + page_group_by_mobility_disabled = 1; + else + page_group_by_mobility_disabled = 0; + + pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", + nr_online_nodes, + page_group_by_mobility_disabled ? "off" : "on", + vm_total_pages); +#ifdef CONFIG_NUMA + pr_info("Policy zone: %s\n", zone_names[policy_zone]); +#endif +} + +/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ +static bool __meminit +overlap_memmap_init(unsigned long zone, unsigned long *pfn) +{ + static struct memblock_region *r; + + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { + if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { + for_each_mem_region(r) { + if (*pfn < memblock_region_memory_end_pfn(r)) + break; + } + } + if (*pfn >= memblock_region_memory_base_pfn(r) && + memblock_is_mirror(r)) { + *pfn = memblock_region_memory_end_pfn(r); + return true; + } + } + return false; +} + +/* + * Initially all pages are reserved - free ones are freed + * up by memblock_free_all() once the early boot process is + * done. Non-atomic initialization, single-pass. + * + * All aligned pageblocks are initialized to the specified migratetype + * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related + * zone stats (e.g., nr_isolate_pageblock) are touched. + */ +void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone, + unsigned long start_pfn, unsigned long zone_end_pfn, + enum meminit_context context, + struct vmem_altmap *altmap, int migratetype) +{ + unsigned long pfn, end_pfn = start_pfn + size; + struct page *page; + + if (highest_memmap_pfn < end_pfn - 1) + highest_memmap_pfn = end_pfn - 1; + +#ifdef CONFIG_ZONE_DEVICE + /* + * Honor reservation requested by the driver for this ZONE_DEVICE + * memory. 
We limit the total number of pages to initialize to just + * those that might contain the memory mapping. We will defer the + * ZONE_DEVICE page initialization until after we have released + * the hotplug lock. + */ + if (zone == ZONE_DEVICE) { + if (!altmap) + return; + + if (start_pfn == altmap->base_pfn) + start_pfn += altmap->reserve; + end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + } +#endif + + for (pfn = start_pfn; pfn < end_pfn; ) { + /* + * There can be holes in boot-time mem_map[]s handed to this + * function. They do not exist on hotplugged memory. + */ + if (context == MEMINIT_EARLY) { + if (overlap_memmap_init(zone, &pfn)) + continue; + if (defer_init(nid, pfn, zone_end_pfn)) + break; + } + + page = pfn_to_page(pfn); + __init_single_page(page, pfn, zone, nid); + if (context == MEMINIT_HOTPLUG) + __SetPageReserved(page); + + /* + * Usually, we want to mark the pageblock MIGRATE_MOVABLE, + * such that unmovable allocations won't be scattered all + * over the place during system boot. + */ + if (pageblock_aligned(pfn)) { + set_pageblock_migratetype(page, migratetype); + cond_resched(); + } + pfn++; + } +} + +#ifdef CONFIG_ZONE_DEVICE +static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap) +{ + + __init_single_page(page, pfn, zone_idx, nid); + + /* + * Mark page reserved as it will need to wait for onlining + * phase for it to be fully associated with a zone. + * + * We can use the non-atomic __set_bit operation for setting + * the flag as we are still initializing the pages. + */ + __SetPageReserved(page); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer + * and zone_device_data. It is a bug if a ZONE_DEVICE page is + * ever freed or placed on a driver-private list. + */ + page->pgmap = pgmap; + page->zone_device_data = NULL; + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. 
This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. + * + * Please note that MEMINIT_HOTPLUG path doesn't clear memmap + * because this is done early in section_activate() + */ + if (pageblock_aligned(pfn)) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); + } + + /* + * ZONE_DEVICE pages are released directly to the driver page allocator + * which will set the page count to 1 when allocating the page. + */ + if (pgmap->type == MEMORY_DEVICE_PRIVATE || + pgmap->type == MEMORY_DEVICE_COHERENT) + set_page_count(page, 0); +} + +/* + * With compound page geometry and when struct pages are stored in ram most + * tail pages are reused. Consequently, the amount of unique struct pages to + * initialize is a lot smaller that the total amount of struct pages being + * mapped. This is a paired / mild layering violation with explicit knowledge + * of how the sparse_vmemmap internals handle compound pages in the lack + * of an altmap. See vmemmap_populate_compound_pages(). + */ +static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap, + unsigned long nr_pages) +{ + return is_power_of_2(sizeof(struct page)) && + !altmap ? 
2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages; +} + +static void __ref memmap_init_compound(struct page *head, + unsigned long head_pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap, + unsigned long nr_pages) +{ + unsigned long pfn, end_pfn = head_pfn + nr_pages; + unsigned int order = pgmap->vmemmap_shift; + + __SetPageHead(head); + for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); + prep_compound_tail(head, pfn - head_pfn); + set_page_count(page, 0); + + /* + * The first tail page stores compound_mapcount_ptr() and + * compound_order() and the second tail page stores + * compound_pincount_ptr(). Call prep_compound_head() after + * the first and second tail pages have been initialized to + * not have the data overwritten. + */ + if (pfn == head_pfn + 2) + prep_compound_head(head, order); + } +} + +void __ref memmap_init_zone_device(struct zone *zone, + unsigned long start_pfn, + unsigned long nr_pages, + struct dev_pagemap *pgmap) +{ + unsigned long pfn, end_pfn = start_pfn + nr_pages; + struct pglist_data *pgdat = zone->zone_pgdat; + struct vmem_altmap *altmap = pgmap_altmap(pgmap); + unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap); + unsigned long zone_idx = zone_idx(zone); + unsigned long start = jiffies; + int nid = pgdat->node_id; + + if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE)) + return; + + /* + * The call to memmap_init should have already taken care + * of the pages reserved for the memmap, so we can just jump to + * the end of that region and start processing the device pages. 
+ */ + if (altmap) { + start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); + nr_pages = end_pfn - start_pfn; + } + + for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) { + struct page *page = pfn_to_page(pfn); + + __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); + + if (pfns_per_compound == 1) + continue; + + memmap_init_compound(page, pfn, zone_idx, nid, pgmap, + compound_nr_pages(altmap, pfns_per_compound)); + } + + pr_info("%s initialised %lu pages in %ums\n", __func__, + nr_pages, jiffies_to_msecs(jiffies - start)); +} + +#endif +static void __meminit zone_init_free_lists(struct zone *zone) +{ + unsigned int order, t; + for_each_migratetype_order(order, t) { + INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); + zone->free_area[order].nr_free = 0; + } +} + +/* + * Only struct pages that correspond to ranges defined by memblock.memory + * are zeroed and initialized by going through __init_single_page() during + * memmap_init_zone_range(). + * + * But, there could be struct pages that correspond to holes in + * memblock.memory. This can happen because of the following reasons: + * - physical memory bank size is not necessarily the exact multiple of the + * arbitrary section size + * - early reserved memory may not be listed in memblock.memory + * - memory layouts defined with memmap= kernel parameter may not align + * nicely with memmap sections + * + * Explicitly initialize those struct pages so that: + * - PG_Reserved is set + * - zone and node links point to zone and node that span the page if the + * hole is in the middle of a zone + * - zone and node links point to adjacent zone/node if the hole falls on + * the zone boundary; the pages in such holes will be prepended to the + * zone/node above the hole except for the trailing pages in the last + * section that will be appended to the zone/node below. 
+ */ +static void __init init_unavailable_range(unsigned long spfn, + unsigned long epfn, + int zone, int node) +{ + unsigned long pfn; + u64 pgcnt = 0; + + for (pfn = spfn; pfn < epfn; pfn++) { + if (!pfn_valid(pageblock_start_pfn(pfn))) { + pfn = pageblock_end_pfn(pfn) - 1; + continue; + } + __init_single_page(pfn_to_page(pfn), pfn, zone, node); + __SetPageReserved(pfn_to_page(pfn)); + pgcnt++; + } + + if (pgcnt) + pr_info("On node %d, zone %s: %lld pages in unavailable ranges", + node, zone_names[zone], pgcnt); +} + +static void __init memmap_init_zone_range(struct zone *zone, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long *hole_pfn) +{ + unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; + int nid = zone_to_nid(zone), zone_id = zone_idx(zone); + + start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); + end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); + + if (start_pfn >= end_pfn) + return; + + memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, + zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + + if (*hole_pfn < start_pfn) + init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); + + *hole_pfn = end_pfn; +} + +static void __init memmap_init(void) +{ + unsigned long start_pfn, end_pfn; + unsigned long hole_pfn = 0; + int i, j, zone_id = 0, nid; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + struct pglist_data *node = NODE_DATA(nid); + + for (j = 0; j < MAX_NR_ZONES; j++) { + struct zone *zone = node->node_zones + j; + + if (!populated_zone(zone)) + continue; + + memmap_init_zone_range(zone, start_pfn, end_pfn, + &hole_pfn); + zone_id = j; + } + } + +#ifdef CONFIG_SPARSEMEM + /* + * Initialize the memory map for hole in the range [memory_end, + * section_end]. + * Append the pages in this hole to the highest zone in the last + * node. 
+ * The call to init_unavailable_range() is outside the ifdef to + * silence the compiler warining about zone_id set but not used; + * for FLATMEM it is a nop anyway + */ + end_pfn = round_up(end_pfn, PAGES_PER_SECTION); + if (hole_pfn < end_pfn) +#endif + init_unavailable_range(hole_pfn, end_pfn, zone_id, nid); +} + +void __init *memmap_alloc(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, int nid, bool exact_nid) +{ + void *ptr; + + if (exact_nid) + ptr = memblock_alloc_exact_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + else + ptr = memblock_alloc_try_nid_raw(size, align, min_addr, + MEMBLOCK_ALLOC_ACCESSIBLE, + nid); + + if (ptr && size > 0) + page_init_poison(ptr, size); + + return ptr; +} + +static int zone_batchsize(struct zone *zone) +{ +#ifdef CONFIG_MMU + int batch; + + /* + * The number of pages to batch allocate is either ~0.1% + * of the zone or 1MB, whichever is smaller. The batch + * size is striking a balance between allocation latency + * and zone lock contention. + */ + batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + /* + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. + * + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. + */ + batch = rounddown_pow_of_two(batch + batch/2) - 1; + + return batch; + +#else + /* The deferral and batching of frees should be suppressed under NOMMU + * conditions. + * + * The problem is that NOMMU needs to be able to allocate large chunks + * of contiguous memory as there's no hardware page translation to + * assemble apparent contiguous memory from discontiguous pages. 
+ * + * Queueing large contiguous runs of pages for batching, however, + * causes the pages to actually be freed in smaller chunks. As there + * can be a significant delay between the individual batches being + * recycled, this leads to the once large chunks of space being + * fragmented and becoming unavailable for high-order allocations. + */ + return 0; +#endif +} + +static int zone_highsize(struct zone *zone, int batch, int cpu_online) +{ +#ifdef CONFIG_MMU + int high; + int nr_split_cpus; + unsigned long total_pages; + + if (!percpu_pagelist_high_fraction) { + /* + * By default, the high value of the pcp is based on the zone + * low watermark so that if they are full then background + * reclaim will not be started prematurely. + */ + total_pages = low_wmark_pages(zone); + } else { + /* + * If percpu_pagelist_high_fraction is configured, the high + * value is based on a fraction of the managed pages in the + * zone. + */ + total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction; + } + + /* + * Split the high value across all online CPUs local to the zone. Note + * that early in boot that CPUs may not be online yet and that during + * CPU hotplug that the cpumask is not yet updated when a CPU is being + * onlined. For memory nodes that have no CPUs, split pcp->high across + * all online CPUs to mitigate the risk that reclaim is triggered + * prematurely due to pages stored on pcp lists. + */ + nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online; + if (!nr_split_cpus) + nr_split_cpus = num_online_cpus(); + high = total_pages / nr_split_cpus; + + /* + * Ensure high is at least batch*4. The multiple is based on the + * historical relationship between high and batch. + */ + high = max(high, batch << 2); + + return high; +#else + return 0; +#endif +} + +/* + * pcp->high and pcp->batch values are related and generally batch is lower + * than high. 
They are also related to pcp->count such that count is lower + * than high, and as soon as it reaches high, the pcplist is flushed. + * + * However, guaranteeing these relations at all times would require e.g. write + * barriers here but also careful usage of read barriers at the read side, and + * thus be prone to error and bad for performance. Thus the update only prevents + * store tearing. Any new users of pcp->batch and pcp->high should ensure they + * can cope with those fields changing asynchronously, and fully trust only the + * pcp->count field on the local CPU with interrupts disabled. + * + * mutex_is_locked(&pcp_batch_high_lock) required when calling this function + * outside of boot time (or some other assurance that no concurrent updaters + * exist). + */ +static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, + unsigned long batch) +{ + WRITE_ONCE(pcp->batch, batch); + WRITE_ONCE(pcp->high, high); +} + +static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) +{ + int pindex; + + memset(pcp, 0, sizeof(*pcp)); + memset(pzstats, 0, sizeof(*pzstats)); + + spin_lock_init(&pcp->lock); + for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) + INIT_LIST_HEAD(&pcp->lists[pindex]); + + /* + * Set batch and high values safe for a boot pageset. A true percpu + * pageset's initialization will update them subsequently. Here we don't + * need to be as careful as pageset_update() as nobody can access the + * pageset yet. + */ + pcp->high = BOOT_PAGESET_HIGH; + pcp->batch = BOOT_PAGESET_BATCH; + pcp->free_factor = 0; +} + +static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, + unsigned long batch) +{ + struct per_cpu_pages *pcp; + int cpu; + + for_each_possible_cpu(cpu) { + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + pageset_update(pcp, high, batch); + } +} + +/* + * Calculate and set new high and batch values for all per-cpu pagesets of a + * zone based on the zone's size. 
+ */ +static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) +{ + int new_high, new_batch; + + new_batch = max(1, zone_batchsize(zone)); + new_high = zone_highsize(zone, new_batch, cpu_online); + + if (zone->pageset_high == new_high && + zone->pageset_batch == new_batch) + return; + + zone->pageset_high = new_high; + zone->pageset_batch = new_batch; + + __zone_set_pageset_high_and_batch(zone, new_high, new_batch); +} + +void __meminit setup_zone_pageset(struct zone *zone) +{ + int cpu; + + /* Size may be 0 on !SMP && !NUMA */ + if (sizeof(struct per_cpu_zonestat) > 0) + zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat); + + zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages); + for_each_possible_cpu(cpu) { + struct per_cpu_pages *pcp; + struct per_cpu_zonestat *pzstats; + + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + per_cpu_pages_init(pcp, pzstats); + } + + zone_set_pageset_high_and_batch(zone, 0); +} + +/* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalculated. + */ +static void zone_pcp_update(struct zone *zone, int cpu_online) +{ + mutex_lock(&pcp_batch_high_lock); + zone_set_pageset_high_and_batch(zone, cpu_online); + mutex_unlock(&pcp_batch_high_lock); +} + +/* + * Allocate per cpu pagesets and initialize them. + * Before this call only boot pagesets were available. + */ +void __init setup_per_cpu_pageset(void) +{ + struct pglist_data *pgdat; + struct zone *zone; + int __maybe_unused cpu; + + for_each_populated_zone(zone) + setup_zone_pageset(zone); + +#ifdef CONFIG_NUMA + /* + * Unpopulated zones continue using the boot pagesets. + * The numa stats for these pagesets need to be reset. + * Otherwise, they will end up skewing the stats of + * the nodes these zones are associated with. 
+ */ + for_each_possible_cpu(cpu) { + struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu); + memset(pzstats->vm_numa_event, 0, + sizeof(pzstats->vm_numa_event)); + } +#endif + + for_each_online_pgdat(pgdat) + pgdat->per_cpu_nodestats = + alloc_percpu(struct per_cpu_nodestat); +} + +static __meminit void zone_pcp_init(struct zone *zone) +{ + /* + * per cpu subsystem is not up at this point. The following code + * relies on the ability of the linker to provide the + * offset of a (static) per cpu variable into the per cpu area. + */ + zone->per_cpu_pageset = &boot_pageset; + zone->per_cpu_zonestats = &boot_zonestats; + zone->pageset_high = BOOT_PAGESET_HIGH; + zone->pageset_batch = BOOT_PAGESET_BATCH; + + if (populated_zone(zone)) + pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name, + zone->present_pages, zone_batchsize(zone)); +} + +void __meminit init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, + unsigned long size) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int zone_idx = zone_idx(zone) + 1; + + if (zone_idx > pgdat->nr_zones) + pgdat->nr_zones = zone_idx; + + zone->zone_start_pfn = zone_start_pfn; + + mminit_dprintk(MMINIT_TRACE, "memmap_init", + "Initialising map node %d zone %lu pfns %lu -> %lu\n", + pgdat->node_id, + (unsigned long)zone_idx(zone), + zone_start_pfn, (zone_start_pfn + size)); + + zone_init_free_lists(zone); + zone->initialized = 1; +} + +/** + * get_pfn_range_for_nid - Return the start and end page frames for a node + * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. + * @start_pfn: Passed by reference. On return, it will have the node start_pfn. + * @end_pfn: Passed by reference. On return, it will have the node end_pfn. + * + * It returns the start and end page frame of a node based on information + * provided by memblock_set_node(). 
If called for a node + * with no available memory, a warning is printed and the start and end + * PFNs will be 0. + */ +void __init get_pfn_range_for_nid(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + unsigned long this_start_pfn, this_end_pfn; + int i; + + *start_pfn = -1UL; + *end_pfn = 0; + + for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { + *start_pfn = min(*start_pfn, this_start_pfn); + *end_pfn = max(*end_pfn, this_end_pfn); + } + + if (*start_pfn == -1UL) + *start_pfn = 0; +} + +/* + * This finds a zone that can be used for ZONE_MOVABLE pages. The + * assumption is made that zones within a node are ordered in monotonic + * increasing memory addresses so that the "highest" populated zone is used + */ +static void __init find_usable_zone_for_movable(void) +{ + int zone_index; + for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { + if (zone_index == ZONE_MOVABLE) + continue; + + if (arch_zone_highest_possible_pfn[zone_index] > + arch_zone_lowest_possible_pfn[zone_index]) + break; + } + + VM_BUG_ON(zone_index == -1); + movable_zone = zone_index; +} + +/* + * The zone ranges provided by the architecture do not include ZONE_MOVABLE + * because it is sized independent of architecture. Unlike the other zones, + * the starting point for ZONE_MOVABLE is not fixed. It may be different + * in each node depending on the size of each node and how evenly kernelcore + * is distributed. This helper function adjusts the zone ranges + * provided by the architecture for a given node by using the end of the + * highest usable zone for ZONE_MOVABLE. 
This preserves the assumption that + * zones within a node are in order of monotonic increases memory addresses + */ +static void __init adjust_zone_range_for_zone_movable(int nid, + unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn) +{ + /* Only adjust if ZONE_MOVABLE is on this node */ + if (zone_movable_pfn[nid]) { + /* Size ZONE_MOVABLE */ + if (zone_type == ZONE_MOVABLE) { + *zone_start_pfn = zone_movable_pfn[nid]; + *zone_end_pfn = min(node_end_pfn, + arch_zone_highest_possible_pfn[movable_zone]); + + /* Adjust for ZONE_MOVABLE starting within this range */ + } else if (!mirrored_kernelcore && + *zone_start_pfn < zone_movable_pfn[nid] && + *zone_end_pfn > zone_movable_pfn[nid]) { + *zone_end_pfn = zone_movable_pfn[nid]; + + /* Check if this whole range is within ZONE_MOVABLE */ + } else if (*zone_start_pfn >= zone_movable_pfn[nid]) + *zone_start_pfn = *zone_end_pfn; + } +} + +/* + * Return the number of pages a zone spans in a node, including holes + * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() + */ +static unsigned long __init zone_spanned_pages_in_node(int nid, + unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn) +{ + unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; + unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; + /* When hotadd a new node from cpu_up(), the node should be empty */ + if (!node_start_pfn && !node_end_pfn) + return 0; + + /* Get the start and end of the zone */ + *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); + *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); + adjust_zone_range_for_zone_movable(nid, zone_type, + node_start_pfn, node_end_pfn, + zone_start_pfn, zone_end_pfn); + + /* Check that this node has pages within the zone's required range */ + 
if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) + return 0; + + /* Move the zone boundaries inside the node if necessary */ + *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); + *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); + + /* Return the spanned pages */ + return *zone_end_pfn - *zone_start_pfn; +} + +/* + * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, + * then all holes in the requested range will be accounted for. + */ +unsigned long __init __absent_pages_in_range(int nid, + unsigned long range_start_pfn, + unsigned long range_end_pfn) +{ + unsigned long nr_absent = range_end_pfn - range_start_pfn; + unsigned long start_pfn, end_pfn; + int i; + + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); + end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); + nr_absent -= end_pfn - start_pfn; + } + return nr_absent; +} + +/** + * absent_pages_in_range - Return number of page frames in holes within a range + * @start_pfn: The start PFN to start searching for holes + * @end_pfn: The end PFN to stop searching for holes + * + * Return: the number of pages frames in memory holes within a range. 
+ */ +unsigned long __init absent_pages_in_range(unsigned long start_pfn, + unsigned long end_pfn) +{ + return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); +} + +/* Return the number of page frames in holes in a zone on a node */ +static unsigned long __init zone_absent_pages_in_node(int nid, + unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn) +{ + unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; + unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; + unsigned long zone_start_pfn, zone_end_pfn; + unsigned long nr_absent; + + /* When hotadd a new node from cpu_up(), the node should be empty */ + if (!node_start_pfn && !node_end_pfn) + return 0; + + zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); + zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); + + adjust_zone_range_for_zone_movable(nid, zone_type, + node_start_pfn, node_end_pfn, + &zone_start_pfn, &zone_end_pfn); + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); + + /* + * ZONE_MOVABLE handling. + * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages + * and vice versa. 
+ */ + if (mirrored_kernelcore && zone_movable_pfn[nid]) { + unsigned long start_pfn, end_pfn; + struct memblock_region *r; + + for_each_mem_region(r) { + start_pfn = clamp(memblock_region_memory_base_pfn(r), + zone_start_pfn, zone_end_pfn); + end_pfn = clamp(memblock_region_memory_end_pfn(r), + zone_start_pfn, zone_end_pfn); + + if (zone_type == ZONE_MOVABLE && + memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + + if (zone_type == ZONE_NORMAL && + !memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + } + } + + return nr_absent; +} + +static void __init calculate_node_totalpages(struct pglist_data *pgdat, + unsigned long node_start_pfn, + unsigned long node_end_pfn) +{ + unsigned long realtotalpages = 0, totalpages = 0; + enum zone_type i; + + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + unsigned long zone_start_pfn, zone_end_pfn; + unsigned long spanned, absent; + unsigned long size, real_size; + + spanned = zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + &zone_start_pfn, + &zone_end_pfn); + absent = zone_absent_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn); + + size = spanned; + real_size = size - absent; + + if (size) + zone->zone_start_pfn = zone_start_pfn; + else + zone->zone_start_pfn = 0; + zone->spanned_pages = size; + zone->present_pages = real_size; +#if defined(CONFIG_MEMORY_HOTPLUG) + zone->present_early_pages = real_size; +#endif + + totalpages += size; + realtotalpages += real_size; + } + + pgdat->node_spanned_pages = totalpages; + pgdat->node_present_pages = realtotalpages; + pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); +} + +#ifndef CONFIG_SPARSEMEM +/* + * Calculate the size of the zone->blockflags rounded to an unsigned long + * Start by making sure zonesize is a multiple of pageblock_order by rounding + * up. 
Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally + * round what is now in bits to nearest long in bits, then return it in + * bytes. + */ +static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) +{ + unsigned long usemapsize; + + zonesize += zone_start_pfn & (pageblock_nr_pages-1); + usemapsize = roundup(zonesize, pageblock_nr_pages); + usemapsize = usemapsize >> pageblock_order; + usemapsize *= NR_PAGEBLOCK_BITS; + usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); + + return usemapsize / 8; +} + +static void __ref setup_usemap(struct zone *zone) +{ + unsigned long usemapsize = usemap_size(zone->zone_start_pfn, + zone->spanned_pages); + zone->pageblock_flags = NULL; + if (usemapsize) { + zone->pageblock_flags = + memblock_alloc_node(usemapsize, SMP_CACHE_BYTES, + zone_to_nid(zone)); + if (!zone->pageblock_flags) + panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", + usemapsize, zone->name, zone_to_nid(zone)); + } +} +#else +static inline void setup_usemap(struct zone *zone) {} +#endif /* CONFIG_SPARSEMEM */ + +#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE + +/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ +void __init set_pageblock_order(void) +{ + unsigned int order = MAX_ORDER - 1; + + /* Check that pageblock_nr_pages has not already been setup */ + if (pageblock_order) + return; + + /* Don't let pageblocks exceed the maximum allocation granularity. */ + if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order) + order = HUGETLB_PAGE_ORDER; + + /* + * Assume the largest contiguous order of interest is a huge page. + * This value may be variable depending on boot parameters on IA64 and + * powerpc. + */ + pageblock_order = order; +} +#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ + +/* + * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() + * is unused as pageblock_order is set at compile-time. 
See
+ * include/linux/pageblock-flags.h for the values of pageblock_order based on
+ * the kernel config
+ */
+void __init set_pageblock_order(void)
+{
+}
+
+#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+/*
+ * calc_memmap_size - estimate how many pages the memmap of a zone occupies.
+ * @spanned_pages: size of the zone including holes
+ * @present_pages: size of the zone excluding holes
+ *
+ * Return: the number of pages needed to hold one struct page for every
+ * counted page, rounded up to a whole page.
+ */
+static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
+						unsigned long present_pages)
+{
+	unsigned long pages = spanned_pages;
+
+	/*
+	 * Provide a more accurate estimation if there are holes within
+	 * the zone and SPARSEMEM is in use. If there are holes within the
+	 * zone, each populated memory region may cost us one or two extra
+	 * memmap pages due to alignment because memmap pages for each
+	 * populated regions may not be naturally aligned on page boundary.
+	 * So the (present_pages >> 4) heuristic is a tradeoff for that:
+	 * present_pages is only used when the span exceeds it by more
+	 * than one sixteenth.
+	 */
+	if (spanned_pages > present_pages + (present_pages >> 4) &&
+	    IS_ENABLED(CONFIG_SPARSEMEM))
+		pages = present_pages;
+
+	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
+}
+/* Initialise the node's deferred split queue: lock, list head and length. */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pgdat_init_split_queue(struct pglist_data *pgdat)
+{
+	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
+
+	spin_lock_init(&ds_queue->split_queue_lock);
+	INIT_LIST_HEAD(&ds_queue->split_queue);
+	ds_queue->split_queue_len = 0;
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+/* Initialise the node's kcompactd wait queue. */
+#ifdef CONFIG_COMPACTION
+static void pgdat_init_kcompactd(struct pglist_data *pgdat)
+{
+	init_waitqueue_head(&pgdat->kcompactd_wait);
+}
+#else /* !CONFIG_COMPACTION */
+static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
+#endif /* CONFIG_COMPACTION */
+/*
+ * Initialise the locks, wait queues and embedded structures of @pgdat.
+ * Shared by the early-boot path (free_area_init_core()) and the hotplug
+ * path (free_area_init_core_hotplug()).
+ */
+static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
+{
+	int i;
+
+	pgdat_resize_init(pgdat);
+	pgdat_kswapd_lock_init(pgdat);
+
+	pgdat_init_split_queue(pgdat);
+	pgdat_init_kcompactd(pgdat);
+
+	init_waitqueue_head(&pgdat->kswapd_wait);
+	init_waitqueue_head(&pgdat->pfmemalloc_wait);
+
+	for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
+		init_waitqueue_head(&pgdat->reclaim_wait[i]);
+
+	pgdat_page_ext_init(pgdat);
+	lruvec_init(&pgdat->__lruvec);
+}
+/*
+ * Initialise the basic fields of @zone: managed page count, owning node,
+ * name (zone_names[@idx]), lock and seqlock, then call zone_pcp_init().
+ */
+static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
+							unsigned long remaining_pages)
+{
+	atomic_long_set(&zone->managed_pages, remaining_pages);
+	zone_set_nid(zone, nid);
+	zone->name = zone_names[idx];
+	zone->zone_pgdat = NODE_DATA(nid);
+	spin_lock_init(&zone->lock);
+	zone_seqlock_init(zone);
+	zone_pcp_init(zone);
+}
+
+/*
+ * Set up the zone data structures
+ * - init pgdat internals
+ * - init all zones belonging to this node
+ *
+ * A node that still points at the shared boot_nodestats gets its own percpu
+ * per_cpu_nodestat, and the node stats of every online CPU are zeroed.
+ * All zones are initialised with zero managed pages.
+ *
+ * NOTE(review): the alloc_percpu() result is not checked before
+ * per_cpu_ptr() uses it.
+ *
+ * NOTE: this function is only called during memory hotplug
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
+{
+	int nid = pgdat->node_id;
+	enum zone_type z;
+	int cpu;
+
+	pgdat_init_internals(pgdat);
+
+	if (pgdat->per_cpu_nodestats == &boot_nodestats)
+		pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
+
+	/*
+	 * Reset the nr_zones, order and highest_zoneidx before reuse.
+	 * Note that kswapd will init kswapd_highest_zoneidx properly
+	 * when it starts in the near future.
+	 */
+	pgdat->nr_zones = 0;
+	pgdat->kswapd_order = 0;
+	pgdat->kswapd_highest_zoneidx = 0;
+	pgdat->node_start_pfn = 0;
+	for_each_online_cpu(cpu) {
+		struct per_cpu_nodestat *p;
+
+		p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+		memset(p, 0, sizeof(*p));
+	}
+
+	for (z = 0; z < MAX_NR_ZONES; z++)
+		zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+/*
+ * Set up the zone data structures:
+ * - mark all pages reserved
+ * - mark all memory queues empty
+ * - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
+ * NOTE: this function is only called during early init.
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat)
+{
+	enum zone_type j;
+	int nid = pgdat->node_id;
+
+	pgdat_init_internals(pgdat);
+	pgdat->per_cpu_nodestats = &boot_nodestats;	/* shared until the hotplug init path replaces it */
+
+	for (j = 0; j < MAX_NR_ZONES; j++) {
+		struct zone *zone = pgdat->node_zones + j;
+		unsigned long size, freesize, memmap_pages;
+
+		size = zone->spanned_pages;
+		freesize = zone->present_pages;
+
+		/*
+		 * Adjust freesize so that it accounts for how much memory
+		 * is used by this zone for memmap. This affects the watermark
+		 * and per-cpu initialisations.
+		 * Highmem zones are not charged here; a memmap larger than
+		 * the zone itself only produces a warning.
+		 */
+		memmap_pages = calc_memmap_size(size, freesize);
+		if (!is_highmem_idx(j)) {
+			if (freesize >= memmap_pages) {
+				freesize -= memmap_pages;
+				if (memmap_pages)
+					pr_debug(" %s zone: %lu pages used for memmap\n",
+						 zone_names[j], memmap_pages);
+			} else
+				pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n",
+					zone_names[j], memmap_pages, freesize);
+		}
+
+		/*
+		 * Account for reserved pages: dma_reserve is charged to the
+		 * first zone only, and only if the zone is larger than it.
+		 */
+		if (j == 0 && freesize > dma_reserve) {
+			freesize -= dma_reserve;
+			pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
+		}
+
+		if (!is_highmem_idx(j))
+			nr_kernel_pages += freesize;
+		/* Charge for highmem memmap if there are enough kernel pages */
+		else if (nr_kernel_pages > memmap_pages * 2)
+			nr_kernel_pages -= memmap_pages;
+		nr_all_pages += freesize;
+
+		/*
+		 * Set an approximate value for lowmem here, it will be adjusted
+		 * when the bootmem allocator frees pages into the buddy system.
+		 * And all highmem pages will be managed by the buddy system.
+		 */
+		zone_init_internals(zone, j, nid, freesize);
+
+		if (!size)
+			continue;	/* zone spans no pages: nothing more to set up */
+
+		set_pageblock_order();
+		setup_usemap(zone);
+		init_currently_empty_zone(zone, zone->zone_start_pfn, size);
+	}
+}
+/*
+ * alloc_node_mem_map - allocate the struct page array of a FLATMEM node.
+ *
+ * The array covers the node rounded out to MAX_ORDER_NR_PAGES boundaries at
+ * both ends, and node_mem_map is pointed at the entry for node_start_pfn.
+ * Nodes that span no pages are skipped and an already set node_mem_map is
+ * left alone. Panics if the allocation fails.
+ */
+#ifdef CONFIG_FLATMEM
+static void __init alloc_node_mem_map(struct pglist_data *pgdat)
+{
+	unsigned long __maybe_unused start = 0;
+	unsigned long __maybe_unused offset = 0;
+
+	/* Skip empty nodes */
+	if (!pgdat->node_spanned_pages)
+		return;
+
+	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+	offset = pgdat->node_start_pfn - start;
+	/* ia64 gets its own node_mem_map, before this, without bootmem */
+	if (!pgdat->node_mem_map) {
+		unsigned long size, end;
+		struct page *map;
+
+		/*
+		 * The zone's endpoints aren't required to be MAX_ORDER
+		 * aligned but the node_mem_map endpoints must be in order
+		 * for the buddy allocator to function correctly.
+		 */
+		end = pgdat_end_pfn(pgdat);
+		end = ALIGN(end, MAX_ORDER_NR_PAGES);
+		size = (end - start) * sizeof(struct page);
+		map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
+				   pgdat->node_id, false);
+		if (!map)
+			panic("Failed to allocate %ld bytes for node %d memory map\n",
+			      size, pgdat->node_id);
+		pgdat->node_mem_map = map + offset;	/* skip the entries below node_start_pfn */
+	}
+	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
+				__func__, pgdat->node_id, (unsigned long)pgdat,
+				(unsigned long)pgdat->node_mem_map);
+#ifndef CONFIG_NUMA
+	/*
+	 * With no DISCONTIG, the global mem_map is just set as node 0's.
+	 * It is moved back by offset when page_to_pfn() of it does not
+	 * already give node_start_pfn.
+	 */
+	if (pgdat == NODE_DATA(0)) {
+		mem_map = NODE_DATA(0)->node_mem_map;
+		if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+			mem_map -= offset;
+	}
+#endif /* !CONFIG_NUMA */
+}
+#else
+static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
+#endif /* CONFIG_FLATMEM */
+/* Set first_deferred_pfn to ULONG_MAX; a no-op without deferred page init. */
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
+{
+	pgdat->first_deferred_pfn = ULONG_MAX;
+}
+#else
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+/*
+ * free_area_init_node - early-boot initialisation of the pgdat of node @nid.
+ *
+ * Looks up the node's PFN range, records node_id and node_start_pfn, logs
+ * the range (or that the node is memoryless), then sizes the zones,
+ * allocates the FLATMEM memory map and initialises the zones through
+ * free_area_init_core().
+ */
+static void __init free_area_init_node(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	unsigned long start_pfn = 0;
+	unsigned long end_pfn = 0;
+
+	/* pg_data_t should be reset to zero when it's allocated */
+	WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
+
+	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+
+	pgdat->node_id = nid;
+	pgdat->node_start_pfn = start_pfn;
+	pgdat->per_cpu_nodestats = NULL;
+
+	if (start_pfn != end_pfn) {
+		pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
+			(u64)start_pfn << PAGE_SHIFT,
+			end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+	} else {
+		pr_info("Initmem setup node %d as memoryless\n", nid);
+	}
+
+	calculate_node_totalpages(pgdat, start_pfn, end_pfn);
+
+	alloc_node_mem_map(pgdat);
+	pgdat_set_deferred_range(pgdat);
+
+	free_area_init_core(pgdat);
+}
+/* A memoryless node goes through the same init path as any other node. */
+static void __init free_area_init_memoryless_node(int nid)
+{
+	free_area_init_node(nid);
+}
+
+#if MAX_NUMNODES > 1
+/*
+ * Figure out the number of possible node ids: one more than the highest
+ * bit set in node_possible_map.
+ */
+void __init setup_nr_node_ids(void)
+{
+	unsigned int highest;
+
+	highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
+	nr_node_ids = highest + 1;
+}
+#endif /* MAX_NUMNODES > 1 */
+
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
+ * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+	unsigned long accl_mask = 0, last_end = 0;
+	unsigned long start, end, mask;
+	int last_nid = NUMA_NO_NODE;
+	int i, nid;
+
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
+		/*
+		 * No node boundary to separate: the range starts at pfn 0,
+		 * is the first one seen, or continues the current node.
+		 */
+		if (!start || last_nid < 0 || last_nid == nid) {
+			last_nid = nid;
+			last_end = end;
+			continue;
+		}
+
+		/*
+		 * Start with a mask granular enough to pin-point to the
+		 * start pfn and tick off bits one-by-one until it becomes
+		 * too coarse to separate the current node from the last.
+		 *
+		 * The shift is done on an unsigned long: __ffs() may return
+		 * any bit index of an unsigned long, and shifting a plain
+		 * int 1 by 31 or more is undefined.
+		 */
+		mask = ~((1UL << __ffs(start)) - 1);
+		while (mask && last_end <= (start & (mask << 1)))
+			mask <<= 1;
+
+		/* accumulate all internode masks */
+		accl_mask |= mask;
+
+		/*
+		 * This range is now the previous one. Without this update
+		 * every later boundary would be measured against the end of
+		 * the first node and could come out too coarse.
+		 */
+		last_nid = nid;
+		last_end = end;
+	}
+
+	/* convert mask to number of pages */
+	return ~accl_mask + 1;
+}
+
+/*
+ * early_calculate_totalpages()
+ * Sum pages in active regions for movable zone.
+ * Populate N_MEMORY for calculating usable_nodes.
+ *
+ * Every node that owns at least one non-empty PFN range is marked N_MEMORY
+ * as a side effect.
+ *
+ * Return: the total number of pages over all memory PFN ranges.
+ */
+static unsigned long __init early_calculate_totalpages(void)
+{
+	unsigned long totalpages = 0;
+	unsigned long start_pfn, end_pfn;
+	int i, nid;
+
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+		unsigned long pages = end_pfn - start_pfn;
+
+		totalpages += pages;
+		if (pages)
+			node_set_state(nid, N_MEMORY);
+	}
+	return totalpages;
+}
+
+/*
+ * Find the PFN the Movable zone begins in each node. Kernel memory
+ * is spread evenly between nodes as long as the nodes have enough
+ * memory.
When they don't, some nodes will have more kernelcore than + * others + */ +static void __init find_zone_movable_pfns_for_nodes(void) +{ + int i, nid; + unsigned long usable_startpfn; + unsigned long kernelcore_node, kernelcore_remaining; + /* save the state before borrow the nodemask */ + nodemask_t saved_node_state = node_states[N_MEMORY]; + unsigned long totalpages = early_calculate_totalpages(); + int usable_nodes = nodes_weight(node_states[N_MEMORY]); + struct memblock_region *r; + + /* Need to find movable_zone earlier when movable_node is specified. */ + find_usable_zone_for_movable(); + + /* + * If movable_node is specified, ignore kernelcore and movablecore + * options. + */ + if (movable_node_is_enabled()) { + for_each_mem_region(r) { + if (!memblock_is_hotpluggable(r)) + continue; + + nid = memblock_get_region_node(r); + + usable_startpfn = PFN_DOWN(r->base); + zone_movable_pfn[nid] = zone_movable_pfn[nid] ? + min(usable_startpfn, zone_movable_pfn[nid]) : + usable_startpfn; + } + + goto out2; + } + + /* + * If kernelcore=mirror is specified, ignore movablecore option + */ + if (mirrored_kernelcore) { + bool mem_below_4gb_not_mirrored = false; + + for_each_mem_region(r) { + if (memblock_is_mirror(r)) + continue; + + nid = memblock_get_region_node(r); + + usable_startpfn = memblock_region_memory_base_pfn(r); + + if (usable_startpfn < PHYS_PFN(SZ_4G)) { + mem_below_4gb_not_mirrored = true; + continue; + } + + zone_movable_pfn[nid] = zone_movable_pfn[nid] ? + min(usable_startpfn, zone_movable_pfn[nid]) : + usable_startpfn; + } + + if (mem_below_4gb_not_mirrored) + pr_warn("This configuration results in unmirrored kernel memory.\n"); + + goto out2; + } + + /* + * If kernelcore=nn% or movablecore=nn% was specified, calculate the + * amount of necessary memory. 
+ */ + if (required_kernelcore_percent) + required_kernelcore = (totalpages * 100 * required_kernelcore_percent) / + 10000UL; + if (required_movablecore_percent) + required_movablecore = (totalpages * 100 * required_movablecore_percent) / + 10000UL; + + /* + * If movablecore= was specified, calculate what size of + * kernelcore that corresponds so that memory usable for + * any allocation type is evenly spread. If both kernelcore + * and movablecore are specified, then the value of kernelcore + * will be used for required_kernelcore if it's greater than + * what movablecore would have allowed. + */ + if (required_movablecore) { + unsigned long corepages; + + /* + * Round-up so that ZONE_MOVABLE is at least as large as what + * was requested by the user + */ + required_movablecore = + roundup(required_movablecore, MAX_ORDER_NR_PAGES); + required_movablecore = min(totalpages, required_movablecore); + corepages = totalpages - required_movablecore; + + required_kernelcore = max(required_kernelcore, corepages); + } + + /* + * If kernelcore was not specified or kernelcore size is larger + * than totalpages, there is no ZONE_MOVABLE. + */ + if (!required_kernelcore || required_kernelcore >= totalpages) + goto out; + + /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ + usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; + +restart: + /* Spread kernelcore memory as evenly as possible throughout nodes */ + kernelcore_node = required_kernelcore / usable_nodes; + for_each_node_state(nid, N_MEMORY) { + unsigned long start_pfn, end_pfn; + + /* + * Recalculate kernelcore_node if the division per node + * now exceeds what is necessary to satisfy the requested + * amount of memory for the kernel + */ + if (required_kernelcore < kernelcore_node) + kernelcore_node = required_kernelcore / usable_nodes; + + /* + * As the map is walked, we track how much memory is usable + * by the kernel using kernelcore_remaining. 
When it is + * 0, the rest of the node is usable by ZONE_MOVABLE + */ + kernelcore_remaining = kernelcore_node; + + /* Go through each range of PFNs within this node */ + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + unsigned long size_pages; + + start_pfn = max(start_pfn, zone_movable_pfn[nid]); + if (start_pfn >= end_pfn) + continue; + + /* Account for what is only usable for kernelcore */ + if (start_pfn < usable_startpfn) { + unsigned long kernel_pages; + kernel_pages = min(end_pfn, usable_startpfn) + - start_pfn; + + kernelcore_remaining -= min(kernel_pages, + kernelcore_remaining); + required_kernelcore -= min(kernel_pages, + required_kernelcore); + + /* Continue if range is now fully accounted */ + if (end_pfn <= usable_startpfn) { + + /* + * Push zone_movable_pfn to the end so + * that if we have to rebalance + * kernelcore across nodes, we will + * not double account here + */ + zone_movable_pfn[nid] = end_pfn; + continue; + } + start_pfn = usable_startpfn; + } + + /* + * The usable PFN range for ZONE_MOVABLE is from + * start_pfn->end_pfn. Calculate size_pages as the + * number of pages used as kernelcore + */ + size_pages = end_pfn - start_pfn; + if (size_pages > kernelcore_remaining) + size_pages = kernelcore_remaining; + zone_movable_pfn[nid] = start_pfn + size_pages; + + /* + * Some kernelcore has been met, update counts and + * break if the kernelcore for this node has been + * satisfied + */ + required_kernelcore -= min(required_kernelcore, + size_pages); + kernelcore_remaining -= size_pages; + if (!kernelcore_remaining) + break; + } + } + + /* + * If there is still required_kernelcore, we do another pass with one + * less node in the count. 
This will push zone_movable_pfn[nid] further + * along on the nodes that still have memory until kernelcore is + * satisfied + */ + usable_nodes--; + if (usable_nodes && required_kernelcore > usable_nodes) + goto restart; + +out2: + /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ + for (nid = 0; nid < MAX_NUMNODES; nid++) { + unsigned long start_pfn, end_pfn; + + zone_movable_pfn[nid] = + roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + if (zone_movable_pfn[nid] >= end_pfn) + zone_movable_pfn[nid] = 0; + } + +out: + /* restore the node_state */ + node_states[N_MEMORY] = saved_node_state; +} + +/* Any regular or high memory on that node ? */ +static void check_for_memory(pg_data_t *pgdat, int nid) +{ + enum zone_type zone_type; + + for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { + struct zone *zone = &pgdat->node_zones[zone_type]; + if (populated_zone(zone)) { + if (IS_ENABLED(CONFIG_HIGHMEM)) + node_set_state(nid, N_HIGH_MEMORY); + if (zone_type <= ZONE_NORMAL) + node_set_state(nid, N_NORMAL_MEMORY); + break; + } + } +} + +/* + * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For + * such cases we allow max_zone_pfn sorted in the descending order + */ +bool __weak arch_has_descending_max_zone_pfns(void) +{ + return false; +} + +/** + * free_area_init - Initialise all pg_data_t and zone data + * @max_zone_pfn: an array of max PFNs for each zone + * + * This will call free_area_init_node() for each active node in the system. + * Using the page ranges provided by memblock_set_node(), the size of each + * zone in each node and their holes is calculated. If the maximum PFN + * between two adjacent zones match, it is assumed that the zone is empty. + * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed + * that arch_max_dma32_pfn has no pages. It is also assumed that a zone + * starts where the previous one ended. 
For example, ZONE_DMA32 starts
+ * at arch_max_dma_pfn.
+ */
+void __init free_area_init(unsigned long *max_zone_pfn)
+{
+	unsigned long start_pfn, end_pfn;
+	int i, nid, zone;
+	bool descending;
+
+	/*
+	 * Record where the zone boundaries are. Zones are laid out back to
+	 * back from the first DRAM PFN; with descending max_zone_pfn they
+	 * are walked from the highest zone index down. ZONE_MOVABLE is
+	 * skipped here and placed per node below.
+	 */
+	memset(arch_zone_lowest_possible_pfn, 0,
+				sizeof(arch_zone_lowest_possible_pfn));
+	memset(arch_zone_highest_possible_pfn, 0,
+				sizeof(arch_zone_highest_possible_pfn));
+
+	start_pfn = PHYS_PFN(memblock_start_of_DRAM());
+	descending = arch_has_descending_max_zone_pfns();
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		if (descending)
+			zone = MAX_NR_ZONES - i - 1;
+		else
+			zone = i;
+
+		if (zone == ZONE_MOVABLE)
+			continue;
+
+		end_pfn = max(max_zone_pfn[zone], start_pfn);	/* never below start: the zone may be empty */
+		arch_zone_lowest_possible_pfn[zone] = start_pfn;
+		arch_zone_highest_possible_pfn[zone] = end_pfn;
+
+		start_pfn = end_pfn;
+	}
+
+	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
+	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+	find_zone_movable_pfns_for_nodes();
+
+	/* Print out the zone ranges */
+	pr_info("Zone ranges:\n");
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		if (i == ZONE_MOVABLE)
+			continue;
+		pr_info(" %-8s ", zone_names[i]);
+		if (arch_zone_lowest_possible_pfn[i] ==
+				arch_zone_highest_possible_pfn[i])
+			pr_cont("empty\n");
+		else
+			pr_cont("[mem %#018Lx-%#018Lx]\n",
+				(u64)arch_zone_lowest_possible_pfn[i]
+					<< PAGE_SHIFT,
+				((u64)arch_zone_highest_possible_pfn[i]
+					<< PAGE_SHIFT) - 1);
+	}
+
+	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
+	pr_info("Movable zone start for each node\n");
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		if (zone_movable_pfn[i])
+			pr_info(" Node %d: %#018Lx\n", i,
+			       (u64)zone_movable_pfn[i] << PAGE_SHIFT);
+	}
+
+	/*
+	 * Print out the early node map, and initialize the
+	 * subsection-map relative to active online memory ranges to
+	 * enable future "sub-section" extensions of the memory map.
+	 */
+	pr_info("Early memory node ranges\n");
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
+		pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
+			(u64)start_pfn << PAGE_SHIFT,
+			((u64)end_pfn << PAGE_SHIFT) - 1);
+		subsection_map_init(start_pfn, end_pfn - start_pfn);
+	}
+
+	/*
+	 * Initialise every node. A node that is not online gets a freshly
+	 * allocated pgdat and is set up as memoryless; an online node is
+	 * initialised and then flagged according to the memory it holds.
+	 */
+	mminit_verify_pageflags_layout();
+	setup_nr_node_ids();
+	for_each_node(nid) {
+		pg_data_t *pgdat;
+
+		if (!node_online(nid)) {
+			pr_info("Initializing node %d as memoryless\n", nid);
+
+			/* Allocator not initialized yet */
+			pgdat = arch_alloc_nodedata(nid);
+			if (!pgdat) {
+				pr_err("Cannot allocate %zuB for node %d.\n",
+						sizeof(*pgdat), nid);
+				continue;
+			}
+			arch_refresh_nodedata(nid, pgdat);
+			free_area_init_memoryless_node(nid);
+
+			/*
+			 * We do not want to confuse userspace by sysfs
+			 * files/directories for node without any memory
+			 * attached to it, so this node is not marked as
+			 * N_MEMORY and not marked online so that no sysfs
+			 * hierarchy will be created via register_one_node for
+			 * it. The pgdat will get fully initialized by
+			 * hotadd_init_pgdat() when memory is hotplugged into
+			 * this node.
+			 */
+			continue;
+		}
+
+		pgdat = NODE_DATA(nid);
+		free_area_init_node(nid);
+
+		/* Any memory on that node */
+		if (pgdat->node_present_pages)
+			node_set_state(nid, N_MEMORY);
+		check_for_memory(pgdat, nid);
+	}
+
+	memmap_init();
+}
+/*
+ * cmdline_parse_core - parse the value of kernelcore= or movablecore=.
+ * @p: option value; NULL is rejected
+ * @core: set to the size in pages when @p is a byte size
+ * @percent: set to the number when it is directly followed by '%',
+ *	     otherwise cleared to 0
+ *
+ * A percentage leaves *@core untouched. Values that look out of range only
+ * trigger a warning.
+ *
+ * Return: 0 on success, -EINVAL if @p is NULL.
+ */
+static int __init cmdline_parse_core(char *p, unsigned long *core,
+				     unsigned long *percent)
+{
+	unsigned long long coremem;
+	char *endptr;
+
+	if (!p)
+		return -EINVAL;
+
+	/* Value may be a percentage of total memory, otherwise bytes */
+	coremem = simple_strtoull(p, &endptr, 0);
+	if (*endptr == '%') {
+		/* Paranoid check for percent values greater than 100 */
+		WARN_ON(coremem > 100);
+
+		*percent = coremem;
+	} else {
+		coremem = memparse(p, &p);
+		/* Paranoid check that UL is enough for the coremem value */
+		WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+
+		*core = coremem >> PAGE_SHIFT;
+		*percent = 0UL;
+	}
+	return 0;
+}
+
+/*
+ * kernelcore=size sets the amount of memory for use for allocations that
+ * cannot be reclaimed or migrated.
+ * kernelcore=mirror instead sets mirrored_kernelcore and leaves the
+ * requested size untouched.
+ */
+static int __init cmdline_parse_kernelcore(char *p)
+{
+	/* parse kernelcore=mirror */
+	if (parse_option_str(p, "mirror")) {
+		mirrored_kernelcore = true;
+		return 0;
+	}
+
+	return cmdline_parse_core(p, &required_kernelcore,
+				  &required_kernelcore_percent);
+}
+
+/*
+ * movablecore=size sets the amount of memory for use for allocations that
+ * can be reclaimed or migrated.
+ */
+static int __init cmdline_parse_movablecore(char *p)
+{
+	return cmdline_parse_core(p, &required_movablecore,
+				  &required_movablecore_percent);
+}
+
+early_param("kernelcore", cmdline_parse_kernelcore);
+early_param("movablecore", cmdline_parse_movablecore);
+/*
+ * adjust_managed_page_count - add @count to the page accounting.
+ * @page: page whose zone's managed_pages is adjusted
+ * @count: signed number of pages to add
+ *
+ * The total RAM counter is adjusted as well, and with CONFIG_HIGHMEM the
+ * highmem total too when @page is a highmem page.
+ */
+void adjust_managed_page_count(struct page *page, long count)
+{
+	atomic_long_add(count, &page_zone(page)->managed_pages);
+	totalram_pages_add(count);
+#ifdef CONFIG_HIGHMEM
+	if (PageHighMem(page))
+		totalhigh_pages_add(count);
+#endif /* CONFIG_HIGHMEM */
+}
+EXPORT_SYMBOL(adjust_managed_page_count);
+/*
+ * free_reserved_area - free the whole pages inside [@start, @end).
+ * @start: start address, rounded up to a page boundary
+ * @end: end address, rounded down to a page boundary
+ * @poison: byte each page is filled with before it is freed; a value
+ *	    outside 0..0xFF (such as -1) skips the fill
+ * @s: name used in the log message, or NULL for no message
+ *
+ * Return: the number of pages freed.
+ */
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
+{
+	void *pos;
+	unsigned long pages = 0;
+
+	start = (void *)PAGE_ALIGN((unsigned long)start);
+	end = (void *)((unsigned long)end & PAGE_MASK);
+	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
+		struct page *page = virt_to_page(pos);
+		void *direct_map_addr;
+
+		/*
+		 * 'direct_map_addr' might be different from 'pos'
+		 * because some architectures' virt_to_page()
+		 * work with aliases. Getting the direct map
+		 * address ensures that we get a _writeable_
+		 * alias for the memset().
+		 */
+		direct_map_addr = page_address(page);
+		/*
+		 * Perform a kasan-unchecked memset() since this memory
+		 * has not been initialized.
+		 */
+		direct_map_addr = kasan_reset_tag(direct_map_addr);
+		if ((unsigned int)poison <= 0xFF)
+			memset(direct_map_addr, poison, PAGE_SIZE);
+
+		free_reserved_page(page);
+	}
+
+	if (pages && s)
+		pr_info("Freeing %s memory: %ldK\n", s, K(pages));
+
+	return pages;
+}
+/*
+ * mem_init_print_info - log a one-line summary of memory at boot: free and
+ * physical memory, the sizes of the kernel image sections, reserved and
+ * CMA-reserved memory, plus highmem with CONFIG_HIGHMEM.
+ */
+void __init mem_init_print_info(void)
+{
+	unsigned long physpages, codesize, datasize, rosize, bss_size;
+	unsigned long init_code_size, init_data_size;
+
+	physpages = get_num_physpages();
+	codesize = _etext - _stext;
+	datasize = _edata - _sdata;
+	rosize = __end_rodata - __start_rodata;
+	bss_size = __bss_stop - __bss_start;
+	init_data_size = __init_end - __init_begin;
+	init_code_size = _einittext - _sinittext;
+
+	/*
+	 * Detect special cases and adjust section sizes accordingly:
+	 * 1) .init.* may be embedded into .data sections
+	 * 2) .init.text.* may be out of [__init_begin, __init_end],
+	 *    please refer to arch/tile/kernel/vmlinux.lds.S.
+	 * 3) .rodata.* may be embedded into .text or .data sections.
+	 *
+	 * adj_init_size() subtracts adj from size when pos lies inside
+	 * [start, end) and size is larger than adj, so that an embedded
+	 * section is not counted twice.
+	 */
+#define adj_init_size(start, end, size, pos, adj) \
+	do { \
+		if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
+			size -= adj; \
+	} while (0)
+
+	adj_init_size(__init_begin, __init_end, init_data_size,
+		     _sinittext, init_code_size);
+	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
+	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
+	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
+	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
+
+#undef adj_init_size
+
+	pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
+#ifdef CONFIG_HIGHMEM
+		", %luK highmem"
+#endif
+		")\n",
+		K(nr_free_pages()), K(physpages),
+		codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
+		(init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
+		K(physpages - totalram_pages() - totalcma_pages),
+		K(totalcma_pages)
+#ifdef CONFIG_HIGHMEM
+		, K(totalhigh_pages())
+#endif
+		);
+}
+
+/**
+ * set_dma_reserve - set the specified number of pages reserved in the first zone
+ * @new_dma_reserve: The number of pages to mark reserved
+ *
+ * The per-cpu batchsize and zone watermarks are determined by managed_pages.
+ * In the DMA zone, a significant percentage may be consumed by kernel image
+ * and other unfreeable allocations which can skew the watermarks badly. This
+ * function may optionally be used to account for unfreeable pages in the
+ * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
+ * smaller per-cpu batchsize.
+ */
+void __init set_dma_reserve(unsigned long new_dma_reserve)
+{
+	dma_reserve = new_dma_reserve;
+}
+/*
+ * CPU hotplug teardown callback registered by page_alloc_init(). Drains the
+ * per-CPU state of the dead @cpu, folds its counters and refreshes the pcp
+ * settings of every populated zone. Always returns 0.
+ */
+static int page_alloc_cpu_dead(unsigned int cpu)
+{
+	struct zone *zone;
+
+	lru_add_drain_cpu(cpu);
+	mlock_page_drain_remote(cpu);
+	drain_pages(cpu);
+
+	/*
+	 * Spill the event counters of the dead processor
+	 * into the current processors event counters.
+	 * This artificially elevates the count of the current
+	 * processor.
+	 */
+	vm_events_fold_cpu(cpu);
+
+	/*
+	 * Zero the differential counters of the dead processor
+	 * so that the vm statistics are consistent.
+	 *
+	 * This is only okay since the processor is dead and cannot
+	 * race with what we are doing.
+	 */
+	cpu_vm_stats_fold(cpu);
+
+	for_each_populated_zone(zone)
+		zone_pcp_update(zone, 0);
+
+	return 0;
+}
+/*
+ * CPU hotplug startup callback registered by page_alloc_init(): refreshes
+ * the pcp settings of every populated zone. Always returns 0.
+ */
+static int page_alloc_cpu_online(unsigned int cpu)
+{
+	struct zone *zone;
+
+	for_each_populated_zone(zone)
+		zone_pcp_update(zone, 1);
+	return 0;
+}
+
+#ifdef CONFIG_NUMA
+int hashdist = HASHDIST_DEFAULT;
+/* hashdist= boot parameter: the value is parsed as a number into hashdist. */
+static int __init set_hashdist(char *str)
+{
+	if (!str)
+		return 0;
+	hashdist = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("hashdist=", set_hashdist);
+#endif /* CONFIG_NUMA */
+/*
+ * page_alloc_init - register the CPU hotplug callbacks of the page
+ * allocator. On NUMA kernels hashdist is forced to 0 when exactly one node
+ * has memory. A failed registration only triggers a warning.
+ */
+void __init page_alloc_init(void)
+{
+	int ret;
+
+#ifdef CONFIG_NUMA
+	if (num_node_state(N_MEMORY) == 1)
+		hashdist = 0;
+#endif
+
+	ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
+					"mm/page_alloc:pcp",
+					page_alloc_cpu_online,
+					page_alloc_cpu_dead);
+	WARN_ON(ret < 0);
+}
+
+/*
+ * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
+ *	or min_free_kbytes changes.
+ *
+ * The reserve of a zone is its largest lowmem_reserve[] entry from its own
+ * index upwards plus its high watermark, capped at the zone's managed pages.
+ * The sum is stored per node and in the global totalreserve_pages.
+ */
+static void calculate_totalreserve_pages(void)
+{
+	struct pglist_data *pgdat;
+	unsigned long reserve_pages = 0;
+	enum zone_type i, j;
+
+	for_each_online_pgdat(pgdat) {
+
+		pgdat->totalreserve_pages = 0;
+
+		for (i = 0; i < MAX_NR_ZONES; i++) {
+			struct zone *zone = pgdat->node_zones + i;
+			long max = 0;
+			unsigned long managed_pages = zone_managed_pages(zone);
+
+			/* Find valid and maximum lowmem_reserve in the zone */
+			for (j = i; j < MAX_NR_ZONES; j++) {
+				if (zone->lowmem_reserve[j] > max)
+					max = zone->lowmem_reserve[j];
+			}
+
+			/* we treat the high watermark as reserved pages. */
+			max += high_wmark_pages(zone);
+
+			if (max > managed_pages)
+				max = managed_pages;
+
+			pgdat->totalreserve_pages += max;
+
+			reserve_pages += max;
+		}
+	}
+	totalreserve_pages = reserve_pages;
+}
+
+/*
+ * setup_per_zone_lowmem_reserve - called whenever
+ * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
+ * has a correct pages reserved value, so an adequate number of
+ * pages are left in the zone after a successful __alloc_pages().
+ */
+static void setup_per_zone_lowmem_reserve(void)
+{
+	struct pglist_data *pgdat;
+	enum zone_type i, j;
+
+	for_each_online_pgdat(pgdat) {
+		for (i = 0; i < MAX_NR_ZONES - 1; i++) {
+			struct zone *zone = &pgdat->node_zones[i];
+			int ratio = sysctl_lowmem_reserve_ratio[i];
+			bool clear = !ratio || !zone_managed_pages(zone);	/* no ratio set or empty zone */
+			unsigned long managed_pages = 0;
+
+			for (j = i + 1; j < MAX_NR_ZONES; j++) {
+				struct zone *upper_zone = &pgdat->node_zones[j];
+
+				managed_pages += zone_managed_pages(upper_zone);	/* running total of zones i+1..j */
+
+				if (clear)
+					zone->lowmem_reserve[j] = 0;
+				else
+					zone->lowmem_reserve[j] = managed_pages / ratio;
+			}
+		}
+	}
+
+	/* update totalreserve_pages */
+	calculate_totalreserve_pages();
+}
+/*
+ * __setup_per_zone_wmarks - recompute the min, low, high and promo
+ * watermarks of every zone from min_free_kbytes and watermark_scale_factor.
+ *
+ * Each zone is updated under its zone->lock with interrupts disabled, its
+ * watermark_boost is reset, and totalreserve_pages is refreshed at the end.
+ */
+static void __setup_per_zone_wmarks(void)
+{
+	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long lowmem_pages = 0;
+	struct zone *zone;
+	unsigned long flags;
+
+	/* Calculate total number of !ZONE_HIGHMEM pages */
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			lowmem_pages += zone_managed_pages(zone);
+	}
+
+	for_each_zone(zone) {
+		u64 tmp;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		tmp = (u64)pages_min * zone_managed_pages(zone);
+		do_div(tmp, lowmem_pages);	/* tmp: this zone's share of pages_min */
+		if (is_highmem(zone)) {
+			/*
+			 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
+			 * need highmem pages, so cap pages_min to a small
+			 * value here.
+			 *
+			 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
+			 * deltas control async page reclaim, and so should
+			 * not be capped for highmem.
+			 */
+			unsigned long min_pages;
+
+			min_pages = zone_managed_pages(zone) / 1024;
+			min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
+			zone->_watermark[WMARK_MIN] = min_pages;
+		} else {
+			/*
+			 * If it's a lowmem zone, reserve a number of pages
+			 * proportionate to the zone's size.
+			 */
+			zone->_watermark[WMARK_MIN] = tmp;
+		}
+
+		/*
+		 * Set the kswapd watermarks distance according to the
+		 * scale factor in proportion to available memory, but
+		 * ensure a minimum size on small systems.
+		 * The distance is the larger of a quarter of the zone's
+		 * share and watermark_scale_factor / 10000 of its managed
+		 * pages; low, high and promo are each one distance apart.
+		 */
+		tmp = max_t(u64, tmp >> 2,
+			    mult_frac(zone_managed_pages(zone),
+				      watermark_scale_factor, 10000));
+
+		zone->watermark_boost = 0;
+		zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
+		zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
+		zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
+
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+
+	/* update totalreserve_pages */
+	calculate_totalreserve_pages();
+}
+
+/**
+ * setup_per_zone_wmarks - called when min_free_kbytes changes
+ * or when memory is hot-{added|removed}
+ *
+ * Ensures that the watermark[min,low,high] values for each zone are set
+ * correctly with respect to min_free_kbytes.
+ */
+void setup_per_zone_wmarks(void)
+{
+	struct zone *zone;
+	static DEFINE_SPINLOCK(lock);	/* serialises the recalculation below */
+
+	spin_lock(&lock);
+	__setup_per_zone_wmarks();
+	spin_unlock(&lock);
+
+	/*
+	 * The watermark size have changed so update the pcpu batch
+	 * and high limits or the limits may be inappropriate.
+	 */
+	for_each_zone(zone)
+		zone_pcp_update(zone, 0);
+}
+
+/*
+ * Initialise min_free_kbytes.
+ *
+ * For small machines we want it small (128k min). For large machines
+ * we want it large (256MB max). But it is not linear, because network
+ * bandwidth does not increase linearly with machine size.
We use + * + * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: + * min_free_kbytes = sqrt(lowmem_kbytes * 16) + * + * which yields + * + * 16MB: 512k + * 32MB: 724k + * 64MB: 1024k + * 128MB: 1448k + * 256MB: 2048k + * 512MB: 2896k + * 1024MB: 4096k + * 2048MB: 5792k + * 4096MB: 8192k + * 8192MB: 11584k + * 16384MB: 16384k + */ +void calculate_min_free_kbytes(void) +{ + unsigned long lowmem_kbytes; + int new_min_free_kbytes; + + lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); + new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); + + if (new_min_free_kbytes > user_min_free_kbytes) + min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144); + else + pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", + new_min_free_kbytes, user_min_free_kbytes); + +} + +int __meminit init_per_zone_wmark_min(void) +{ + calculate_min_free_kbytes(); + setup_per_zone_wmarks(); + refresh_zone_stat_thresholds(); + setup_per_zone_lowmem_reserve(); + +#ifdef CONFIG_NUMA + setup_min_unmapped_ratio(); + setup_min_slab_ratio(); +#endif + + khugepaged_min_free_kbytes_update(); + + return 0; +} +postcore_initcall(init_per_zone_wmark_min) + +/* + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * that we can call two helper functions whenever min_free_kbytes + * changes. 
+ */ +int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) { + user_min_free_kbytes = min_free_kbytes; + setup_per_zone_wmarks(); + } + return 0; +} + +int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) + setup_per_zone_wmarks(); + + return 0; +} + +#ifdef CONFIG_NUMA +static void setup_min_unmapped_ratio(void) +{ + pg_data_t *pgdat; + struct zone *zone; + + for_each_online_pgdat(pgdat) + pgdat->min_unmapped_pages = 0; + + for_each_zone(zone) + zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) * + sysctl_min_unmapped_ratio) / 100; +} + + +int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + setup_min_unmapped_ratio(); + + return 0; +} + +static void setup_min_slab_ratio(void) +{ + pg_data_t *pgdat; + struct zone *zone; + + for_each_online_pgdat(pgdat) + pgdat->min_slab_pages = 0; + + for_each_zone(zone) + zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) * + sysctl_min_slab_ratio) / 100; +} + +int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + setup_min_slab_ratio(); + + return 0; +} +#endif + +/* + * lowmem_reserve_ratio_sysctl_handler - just a wrapper around + * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() + * whenever sysctl_lowmem_reserve_ratio changes. 
+ * + * The reserve ratio obviously has absolutely no relation with the + * minimum watermarks. The lowmem reserve ratio can only make sense + * if in function of the boot time zone sizes. + */ +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int i; + + proc_dointvec_minmax(table, write, buffer, length, ppos); + + for (i = 0; i < MAX_NR_ZONES; i++) { + if (sysctl_lowmem_reserve_ratio[i] < 1) + sysctl_lowmem_reserve_ratio[i] = 0; + } + + setup_per_zone_lowmem_reserve(); + return 0; +} + +/* + * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each + * cpu. It is the fraction of total pages in each zone that a hot per cpu + * pagelist can have before it gets flushed back to buddy allocator. + */ +int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table, + int write, void *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int old_percpu_pagelist_high_fraction; + int ret; + + mutex_lock(&pcp_batch_high_lock); + old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction; + + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (!write || ret < 0) + goto out; + + /* Sanity checking to avoid pcp imbalance */ + if (percpu_pagelist_high_fraction && + percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) { + percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction; + ret = -EINVAL; + goto out; + } + + /* No change? */ + if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction) + goto out; + + for_each_populated_zone(zone) + zone_set_pageset_high_and_batch(zone, 0); +out: + mutex_unlock(&pcp_batch_high_lock); + return ret; +} + +#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES +/* + * Returns the number of pages that arch has reserved but + * is not known to alloc_large_system_hash(). 
+ */
+static unsigned long __init arch_reserved_kernel_pages(void)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Adaptive scale is meant to reduce sizes of hash tables on large memory
+ * machines. As memory size is increased the scale is also increased but at
+ * slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory
+ * quadruples the scale is increased by one, which means the size of hash table
+ * only doubles, instead of quadrupling as well.
+ * Because 32-bit systems cannot have large physical memory, where this scaling
+ * makes sense, it is disabled on such platforms.
+ */
+#if __BITS_PER_LONG > 32
+#define ADAPT_SCALE_BASE	(64ul << 30)
+#define ADAPT_SCALE_SHIFT	2
+#define ADAPT_SCALE_NPAGES	(ADAPT_SCALE_BASE >> PAGE_SHIFT)
+#endif
+
+/*
+ * allocate a large system hash table from bootmem
+ * - it is assumed that the hash table must contain an exact power-of-2
+ *   quantity of entries
+ * - limit is the number of hash buckets, not the total allocation size
+ *
+ * When @numentries is 0 the entry count is derived from nr_kernel_pages
+ * and @scale.  The count is rounded up to a power of two and then bounded
+ * by @low_limit and by @high_limit (or, when @high_limit is 0, by 1/16 of
+ * total memory divided by @bucketsize).  The allocation is retried with
+ * half as many entries for as long as it fails and the table is still
+ * larger than a page; a final failure panics.  log2 of the resulting entry
+ * count is stored in *@_hash_shift and the matching index mask in
+ * *@_hash_mask when those pointers are non-NULL.
+ *
+ * NOTE(review): the HASH_SMALL path reads *@_hash_shift without a NULL
+ * check, so such callers presumably always pass it - confirm.
+ */
+void *__init alloc_large_system_hash(const char *tablename,
+				     unsigned long bucketsize,
+				     unsigned long numentries,
+				     int scale,
+				     int flags,
+				     unsigned int *_hash_shift,
+				     unsigned int *_hash_mask,
+				     unsigned long low_limit,
+				     unsigned long high_limit)
+{
+	unsigned long long max = high_limit;
+	unsigned long log2qty, size;
+	void *table;
+	gfp_t gfp_flags;
+	bool virt;
+	bool huge;	/* only meaningful (and only read) when virt is true */
+
+	/* allow the kernel cmdline to have a say */
+	if (!numentries) {
+		/* round applicable memory size up to nearest megabyte */
+		numentries = nr_kernel_pages;
+		numentries -= arch_reserved_kernel_pages();
+
+		/* It isn't necessary when PAGE_SIZE >= 1MB */
+		if (PAGE_SIZE < SZ_1M)
+			numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
+
+#if __BITS_PER_LONG > 32
+		if (!high_limit) {
+			unsigned long adapt;
+
+			for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
+			     adapt <<= ADAPT_SCALE_SHIFT)
+				scale++;
+		}
+#endif
+
+		/* limit to 1 bucket per 2^scale bytes of low memory */
+		if (scale > PAGE_SHIFT)
+			numentries >>= (scale - PAGE_SHIFT);
+		else
+			numentries <<= (PAGE_SHIFT - scale);
+
+		/* Make sure we've got at least a 0-order allocation.. */
+		if (unlikely(flags & HASH_SMALL)) {
+			/* Makes no sense without HASH_EARLY */
+			WARN_ON(!(flags & HASH_EARLY));
+			if (!(numentries >> *_hash_shift)) {
+				numentries = 1UL << *_hash_shift;
+				BUG_ON(!numentries);
+			}
+		} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+			numentries = PAGE_SIZE / bucketsize;
+	}
+	numentries = roundup_pow_of_two(numentries);
+
+	/* limit allocation size to 1/16 total memory by default */
+	if (max == 0) {
+		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
+		do_div(max, bucketsize);
+	}
+	max = min(max, 0x80000000ULL);
+
+	if (numentries < low_limit)
+		numentries = low_limit;
+	if (numentries > max)
+		numentries = max;
+
+	log2qty = ilog2(numentries);
+
+	gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
+	do {
+		virt = false;
+		size = bucketsize << log2qty;
+		if (flags & HASH_EARLY) {
+			if (flags & HASH_ZERO)
+				table = memblock_alloc(size, SMP_CACHE_BYTES);
+			else
+				table = memblock_alloc_raw(size,
+							   SMP_CACHE_BYTES);
+		} else if (get_order(size) >= MAX_ORDER || hashdist) {
+			table = vmalloc_huge(size, gfp_flags);
+			virt = true;
+			if (table)
+				huge = is_vm_area_hugepages(table);
+		} else {
+			/*
+			 * If bucketsize is not a power-of-two, we may free
+			 * some pages at the end of hash table which
+			 * alloc_pages_exact() automatically does
+			 */
+			table = alloc_pages_exact(size, gfp_flags);
+			kmemleak_alloc(table, size, 1, gfp_flags);
+		}
+	} while (!table && size > PAGE_SIZE && --log2qty);
+
+	if (!table)
+		panic("Failed to allocate %s hash table\n", tablename);
+
+	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
+		tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
+		virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
+
+	if (_hash_shift)
+		*_hash_shift = log2qty;
+	if (_hash_mask)
+		*_hash_mask = (1 << log2qty) - 1;
+
+	return table;
+}
+
+#ifdef CONFIG_CONTIG_ALLOC
+#if defined(CONFIG_DYNAMIC_DEBUG) || \
+	(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
+/*
+ * When the "migrate failure" dynamic debug branch is enabled, dump the
+ * stack and every page on @page_list.
+ *
+ * Usage: See admin-guide/dynamic-debug-howto.rst
+ */
+static void alloc_contig_dump_pages(struct list_head *page_list)
+{
+	DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, "migrate failure");
+
+	if (DYNAMIC_DEBUG_BRANCH(descriptor)) {
+		struct page *page;
+
+		dump_stack();
+		list_for_each_entry(page, page_list, lru)
+			dump_page(page, "migration failure");
+	}
+}
+#else
+static inline void alloc_contig_dump_pages(struct list_head *page_list)
+{
+}
+#endif
+
+/*
+ * [start, end) must belong to a single zone.
+ *
+ * Isolates the pages in the range batch by batch, reclaims the clean ones
+ * and migrates the rest, with the LRU cache disabled for the duration.
+ * Returns 0 on success or a negative errno: -EINTR on a fatal signal,
+ * -EBUSY when a batch is still pending after repeated attempts, or the
+ * error from isolation/migration.  On failure, pages still on
+ * cc->migratepages are put back.
+ */
+int __alloc_contig_migrate_range(struct compact_control *cc,
+					unsigned long start, unsigned long end)
+{
+	/* This function is based on compact_zone() from compaction.c. */
+	unsigned int nr_reclaimed;
+	unsigned long pfn = start;
+	unsigned int tries = 0;
+	int ret = 0;
+	struct migration_target_control mtc = {
+		.nid = zone_to_nid(cc->zone),
+		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+	};
+
+	lru_cache_disable();
+
+	while (pfn < end || !list_empty(&cc->migratepages)) {
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		if (list_empty(&cc->migratepages)) {
+			cc->nr_migratepages = 0;
+			ret = isolate_migratepages_range(cc, pfn, end);
+			if (ret && ret != -EAGAIN)
+				break;
+			pfn = cc->migrate_pfn;
+			tries = 0;
+		} else if (++tries == 5) {
+			ret = -EBUSY;
+			break;
+		}
+
+		nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
+							&cc->migratepages);
+		cc->nr_migratepages -= nr_reclaimed;
+
+		ret = migrate_pages(&cc->migratepages, alloc_migration_target,
+			NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL);
+
+		/*
+		 * On -ENOMEM, migrate_pages() bails out right away. It is pointless
+		 * to retry again over this error, so do the same here.
+		 */
+		if (ret == -ENOMEM)
+			break;
+	}
+
+	lru_cache_enable();
+	if (ret < 0) {
+		if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
+			alloc_contig_dump_pages(&cc->migratepages);
+		putback_movable_pages(&cc->migratepages);
+		return ret;
+	}
+	return 0;
+}
+
+/**
+ * alloc_contig_range() -- tries to allocate given range of pages
+ * @start:	start PFN to allocate
+ * @end:	one-past-the-last PFN to allocate
+ * @migratetype:	migratetype of the underlying pageblocks (either
+ *			#MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
+ *			in range must have the same migratetype and it must
+ *			be either of the two.
+ * @gfp_mask:	GFP mask to use during compaction
+ *
+ * The PFN range does not have to be pageblock aligned. The PFN range must
+ * belong to a single zone.
+ *
+ * The first thing this routine does is attempt to MIGRATE_ISOLATE all
+ * pageblocks in the range.  Once isolated, the pageblocks should not
+ * be modified by others.
+ *
+ * Return: zero on success or negative error code.  On success all
+ * pages whose PFN is in [start, end) are allocated for the caller and
+ * need to be freed with free_contig_range().
+ */
+int alloc_contig_range(unsigned long start, unsigned long end,
+		       unsigned migratetype, gfp_t gfp_mask)
+{
+	unsigned long outer_start, outer_end;
+	int order;
+	int ret = 0;
+
+	struct compact_control cc = {
+		.nr_migratepages = 0,
+		.order = -1,
+		.zone = page_zone(pfn_to_page(start)),
+		.mode = MIGRATE_SYNC,
+		.ignore_skip_hint = true,
+		.no_set_skip_hint = true,
+		.gfp_mask = current_gfp_context(gfp_mask),
+		.alloc_contig = true,
+	};
+	INIT_LIST_HEAD(&cc.migratepages);
+
+	/*
+	 * What we do here is we mark all pageblocks in range as
+	 * MIGRATE_ISOLATE.  Because pageblock and max order pages may
+	 * have different sizes, and due to the way the page allocator
+	 * works, start_isolate_page_range() has special handling for this.
+	 *
+	 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
+	 * migrate the pages from an unaligned range (ie. pages that
+	 * we are interested in). This will put all the pages in
+	 * range back to page allocator as MIGRATE_ISOLATE.
+	 *
+	 * When this is done, we take the pages in range from page
+	 * allocator removing them from the buddy system.  This way
+	 * page allocator will never consider using them.
+	 *
+	 * This lets us mark the pageblocks back as
+	 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
+	 * aligned range but not in the unaligned, original range are
+	 * put back to page allocator so that buddy can use them.
+	 */
+
+	ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
+	if (ret)
+		goto done;
+
+	drain_all_pages(cc.zone);
+
+	/*
+	 * In case of -EBUSY, we'd like to know which page causes problem.
+	 * So, just fall through. test_pages_isolated() has a tracepoint
+	 * which will report the busy page.
+	 *
+	 * It is possible that busy pages could become available before
+	 * the call to test_pages_isolated, and the range will actually be
+	 * allocated.  So, if we fall through be sure to clear ret so that
+	 * -EBUSY is not accidentally used or returned to caller.
+	 */
+	ret = __alloc_contig_migrate_range(&cc, start, end);
+	if (ret && ret != -EBUSY)
+		goto done;
+	ret = 0;
+
+	/*
+	 * Pages from [start, end) are within a pageblock_nr_pages
+	 * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
+	 * more, all pages in [start, end) are free in page allocator.
+	 * What we are going to do is to allocate all pages from
+	 * [start, end) (that is remove them from page allocator).
+	 *
+	 * The only problem is that pages at the beginning and at the
+	 * end of interesting range may be not aligned with pages that
+	 * page allocator holds, ie. they can be part of higher order
+	 * pages.  Because of this, we reserve the bigger range and
+	 * once this is done free the pages we are not interested in.
+	 *
+	 * We don't have to hold zone->lock here because the pages are
+	 * isolated thus they won't get removed from buddy.
+	 */
+
+	order = 0;
+	outer_start = start;
+	/* Round outer_start down order by order until it hits a buddy head. */
+	while (!PageBuddy(pfn_to_page(outer_start))) {
+		if (++order >= MAX_ORDER) {
+			outer_start = start;
+			break;
+		}
+		outer_start &= ~0UL << order;
+	}
+
+	if (outer_start != start) {
+		order = buddy_order(pfn_to_page(outer_start));
+
+		/*
+		 * outer_start page could be small order buddy page and
+		 * it doesn't include start page. Adjust outer_start
+		 * in this case to report failed page properly
+		 * on tracepoint in test_pages_isolated()
+		 */
+		if (outer_start + (1UL << order) <= start)
+			outer_start = start;
+	}
+
+	/* Make sure the range is really isolated. */
+	if (test_pages_isolated(outer_start, end, 0)) {
+		ret = -EBUSY;
+		goto done;
+	}
+
+	/* Grab isolated pages from freelists. */
+	outer_end = isolate_freepages_range(&cc, outer_start, end);
+	if (!outer_end) {
+		ret = -EBUSY;
+		goto done;
+	}
+
+	/* Free head and tail (if any) */
+	if (start != outer_start)
+		free_contig_range(outer_start, start - outer_start);
+	if (end != outer_end)
+		free_contig_range(end, outer_end - end);
+
+done:
+	undo_isolate_page_range(start, end, migratetype);
+	return ret;
+}
+EXPORT_SYMBOL(alloc_contig_range);
+/*
+ * Try to allocate the nr_pages PFNs starting at start_pfn as
+ * MIGRATE_MOVABLE via alloc_contig_range(); returns its result.
+ */
+static int __alloc_contig_pages(unsigned long start_pfn,
+				unsigned long nr_pages, gfp_t gfp_mask)
+{
+	unsigned long end_pfn = start_pfn + nr_pages;
+
+	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
+				  gfp_mask);
+}
+/*
+ * Return true only if every PFN in [start_pfn, start_pfn + nr_pages) maps
+ * to an online page that belongs to zone @z and is neither PageReserved()
+ * nor PageHuge().
+ */
+static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
+				   unsigned long nr_pages)
+{
+	unsigned long i, end_pfn = start_pfn + nr_pages;
+	struct page *page;
+
+	for (i = start_pfn; i < end_pfn; i++) {
+		page = pfn_to_online_page(i);
+		if (!page)
+			return false;
+
+		if (page_zone(page) != z)
+			return false;
+
+		if (PageReserved(page))
+			return false;
+
+		if (PageHuge(page))
+			return false;
+	}
+	return true;
+}
+/*
+ * Return true if the last PFN of the nr_pages-long range starting at
+ * start_pfn is spanned by @zone.
+ */
+static bool zone_spans_last_pfn(const struct zone *zone,
+				unsigned long start_pfn, unsigned long nr_pages)
+{
+	unsigned long last_pfn = start_pfn + nr_pages - 1;
+
+	return zone_spans_pfn(zone, last_pfn);
+}
+
+/**
+ * alloc_contig_pages() -- tries to find and allocate contiguous range of pages
+ * @nr_pages:	Number of contiguous pages to allocate
+ * @gfp_mask:	GFP mask to limit search and used during compaction
+ * @nid:	Target node
+ * @nodemask:	Mask for other possible nodes
+ *
+ * This routine is a wrapper around alloc_contig_range(). It scans over zones
+ * on an applicable zonelist to find a contiguous pfn range which can then be
+ * tried for allocation with alloc_contig_range(). This routine is intended
+ * for allocation requests which can not be fulfilled with the buddy allocator.
+ *
+ * The allocated memory is always aligned to a page boundary. If nr_pages is a
+ * power of two, then allocated range is also guaranteed to be aligned to same
+ * nr_pages (e.g. 1GB request would be aligned to 1GB).
+ *
+ * Allocated pages can be freed with free_contig_range() or by manually calling
+ * __free_page() on each allocated page.
+ *
+ * Return: pointer to contiguous pages on success, or NULL if not successful.
+ */
+struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
+				int nid, nodemask_t *nodemask)
+{
+	unsigned long ret, pfn, flags;
+	struct zonelist *zonelist;
+	struct zone *zone;
+	struct zoneref *z;
+
+	zonelist = node_zonelist(nid, gfp_mask);
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+					gfp_zone(gfp_mask), nodemask) {
+		spin_lock_irqsave(&zone->lock, flags);
+
+		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
+		while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
+			if (pfn_range_valid_contig(zone, pfn, nr_pages)) {
+				/*
+				 * We release the zone lock here because
+				 * alloc_contig_range() will also lock the zone
+				 * at some point. If there's an allocation
+				 * spinning on this lock, it may win the race
+				 * and cause alloc_contig_range() to fail...
+				 */
+				spin_unlock_irqrestore(&zone->lock, flags);
+				ret = __alloc_contig_pages(pfn, nr_pages,
+							gfp_mask);
+				if (!ret)
+					return pfn_to_page(pfn);
+				spin_lock_irqsave(&zone->lock, flags);
+			}
+			pfn += nr_pages;
+		}
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+	return NULL;
+}
+#endif /* CONFIG_CONTIG_ALLOC */
+/*
+ * Free the nr_pages pages starting at pfn one at a time with __free_page().
+ * Warns, after freeing, if any of them had a reference count other than 1
+ * when it was examined.
+ */
+void free_contig_range(unsigned long pfn, unsigned long nr_pages)
+{
+	unsigned long count = 0;
+
+	for (; nr_pages--; pfn++) {
+		struct page *page = pfn_to_page(pfn);
+
+		count += page_count(page) != 1;
+		__free_page(page);
+	}
+	WARN(count != 0, "%lu pages are still in use!\n", count);
+}
+EXPORT_SYMBOL(free_contig_range);
+
+/*
+ * Effectively disable pcplists for the zone by setting the high limit to 0
+ * and draining all cpus. A concurrent page freeing on another CPU that's about
+ * to put the page on pcplist will either finish before the drain and the page
+ * will be drained, or observe the new high limit and skip the pcplist.
+ *
+ * Must be paired with a call to zone_pcp_enable().
+ */
+void zone_pcp_disable(struct zone *zone)
+{
+	mutex_lock(&pcp_batch_high_lock);	/* released by zone_pcp_enable() */
+	__zone_set_pageset_high_and_batch(zone, 0, 1);
+	__drain_all_pages(zone, true);
+}
+/*
+ * Counterpart of zone_pcp_disable(): restore the zone's saved
+ * pageset_high/pageset_batch values and drop pcp_batch_high_lock.
+ */
+void zone_pcp_enable(struct zone *zone)
+{
+	__zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
+	mutex_unlock(&pcp_batch_high_lock);
+}
+/*
+ * Point the zone back at the boot-time per-cpu structures.  If the zone
+ * has its own per_cpu_pageset, the per-cpu zone statistics of every online
+ * CPU are drained first, then the pageset (and the zonestats, unless they
+ * are already boot_zonestats) are freed.  Does nothing when the zone still
+ * uses boot_pageset.
+ */
+void zone_pcp_reset(struct zone *zone)
+{
+	int cpu;
+	struct per_cpu_zonestat *pzstats;
+
+	if (zone->per_cpu_pageset != &boot_pageset) {
+		for_each_online_cpu(cpu) {
+			pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+			drain_zonestat(zone, pzstats);
+		}
+		free_percpu(zone->per_cpu_pageset);
+		zone->per_cpu_pageset = &boot_pageset;
+		if (zone->per_cpu_zonestats != &boot_zonestats) {
+			free_percpu(zone->per_cpu_zonestats);
+			zone->per_cpu_zonestats = &boot_zonestats;
+		}
+	}
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * All pages in the range must be in a single zone, must not contain holes,
+ * must span full sections, and must be isolated before calling this function.
+ *
+ * Marks the sections offline, then walks the range under zone->lock and
+ * removes every free buddy block from its free list.  HWPoisoned pages
+ * that are not in the buddy system and PageOffline() pages are skipped.
+ */
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long pfn = start_pfn;
+	struct page *page;
+	struct zone *zone;
+	unsigned int order;
+	unsigned long flags;
+
+	offline_mem_sections(pfn, end_pfn);
+	zone = page_zone(pfn_to_page(pfn));
+	spin_lock_irqsave(&zone->lock, flags);
+	while (pfn < end_pfn) {
+		page = pfn_to_page(pfn);
+		/*
+		 * The HWPoisoned page may be not in buddy system, and
+		 * page_count() is not 0.
+		 */
+		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
+			pfn++;
+			continue;
+		}
+		/*
+		 * At this point all remaining PageOffline() pages have a
+		 * reference count of 0 and can simply be skipped.
+		 */
+		if (PageOffline(page)) {
+			BUG_ON(page_count(page));
+			BUG_ON(PageBuddy(page));
+			pfn++;
+			continue;
+		}
+
+		BUG_ON(page_count(page));
+		BUG_ON(!PageBuddy(page));
+		order = buddy_order(page);
+		del_page_from_free_list(page, zone, order);
+		pfn += (1 << order);	/* skip the whole buddy block */
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+#endif
+
+/*
+ * This function returns a stable result only if called under zone lock.
+ *
+ * For each order below MAX_ORDER it looks at the page that would head the
+ * order-aligned block containing @page, and reports true as soon as one of
+ * them is a buddy page whose order is at least that large.
+ */
+bool is_free_buddy_page(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+	unsigned int order;
+
+	for (order = 0; order < MAX_ORDER; order++) {
+		struct page *page_head = page - (pfn & ((1 << order) - 1));
+
+		if (PageBuddy(page_head) &&
+		    buddy_order_unsafe(page_head) >= order)
+			break;
+	}
+
+	return order < MAX_ORDER;
+}
+EXPORT_SYMBOL(is_free_buddy_page);
+
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Break down a higher-order page in sub-pages, and keep our target out of
+ * buddy allocator.
+ *
+ * Starting from order @high, the block is halved repeatedly down to order
+ * @low.  At each step the half that does not contain @target is either
+ * turned into a guard page or put back on the free list (unless that half
+ * is @target itself); the walk continues in the half that contains @target.
+ */
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
+				   struct page *target, int low, int high,
+				   int migratetype)
+{
+	unsigned long size = 1 << high;
+	struct page *current_buddy, *next_page;
+
+	while (high > low) {
+		high--;
+		size >>= 1;
+
+		if (target >= &page[size]) {
+			next_page = page + size;
+			current_buddy = page;
+		} else {
+			next_page = page;
+			current_buddy = page + size;
+		}
+		page = next_page;
+
+		if (set_page_guard(zone, current_buddy, high, migratetype))
+			continue;
+
+		if (current_buddy != target) {
+			add_to_free_list(current_buddy, zone, high, migratetype);
+			set_buddy_order(current_buddy, high);
+		}
+	}
+}
+
+/*
+ * Take a page that will be marked as poisoned off the buddy allocator.
+ */
+bool take_page_off_buddy(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long flags;
+	unsigned int order;
+	bool ret = false;
+	/*
+	 * For each order, look at the would-be head of the order-aligned
+	 * block containing @page.  If it is a free buddy block of at least
+	 * that order, pull it off its free list and split it so that only
+	 * @page stays out of the allocator; the free-page counter is
+	 * adjusted unless the block is isolated.  The search stops early at
+	 * a head page with a non-zero reference count.  Returns true only
+	 * if @page was taken off.
+	 */
+	spin_lock_irqsave(&zone->lock, flags);
+	for (order = 0; order < MAX_ORDER; order++) {
+		struct page *page_head = page - (pfn & ((1 << order) - 1));
+		int page_order = buddy_order(page_head);
+
+		if (PageBuddy(page_head) && page_order >= order) {
+			unsigned long pfn_head = page_to_pfn(page_head);
+			int migratetype = get_pfnblock_migratetype(page_head,
+								   pfn_head);
+
+			del_page_from_free_list(page_head, zone, page_order);
+			break_down_buddy_pages(zone, page_head, page, 0,
+						page_order, migratetype);
+			SetPageHWPoisonTakenOff(page);
+			if (!is_migrate_isolate(migratetype))
+				__mod_zone_freepage_state(zone, -1, migratetype);
+			ret = true;
+			break;
+		}
+		if (page_count(page_head) > 0)
+			break;
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+	return ret;
+}
+
+/*
+ * Cancel takeoff done by take_page_off_buddy().
+ */
+bool put_page_back_buddy(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long flags;
+	int migratetype = get_pfnblock_migratetype(page, pfn);
+	bool ret = false;
+	/*
+	 * The page is only freed back (as an order-0 page) if dropping our
+	 * reference brings its count to zero.  True is returned only when
+	 * that happened and the HWPoison flag was set and is now cleared.
+	 */
+	spin_lock_irqsave(&zone->lock, flags);
+	if (put_page_testzero(page)) {
+		ClearPageHWPoisonTakenOff(page);
+		__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
+		if (TestClearPageHWPoison(page)) {
+			ret = true;
+		}
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	return ret;
+}
+#endif
+/* Return true if ZONE_DMA has managed pages on at least one online node. */
+#ifdef CONFIG_ZONE_DMA
+bool has_managed_dma(void)
+{
+	struct pglist_data *pgdat;
+
+	for_each_online_pgdat(pgdat) {
+		struct zone *zone = &pgdat->node_zones[ZONE_DMA];
+
+		if (managed_zone(zone))
+			return true;
+	}
+	return false;
+}
+#endif /* CONFIG_ZONE_DMA */
diff --git a/mm/page_counter.c b/mm/page_counter.c
new file mode 100644
index 000000000..db20d6452
--- /dev/null
+++ b/mm/page_counter.c
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Lockless hierarchical page accounting & limiting
+ *
+ * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
+ */
+/* NOTE(review): the header names of the #include lines below are missing in this copy. */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+/*
+ * Recompute how much of @usage is covered by c->min and c->low, store the
+ * results in c->min_usage / c->low_usage, and add any difference from the
+ * previously stored values to the parent's children_min_usage /
+ * children_low_usage.  Counters without a parent are skipped.
+ */
+static void propagate_protected_usage(struct page_counter *c,
+				      unsigned long usage)
+{
+	unsigned long protected, old_protected;
+	long delta;
+
+	if (!c->parent)
+		return;
+
+	protected = min(usage, READ_ONCE(c->min));
+	old_protected = atomic_long_read(&c->min_usage);
+	if (protected != old_protected) {
+		/* Use the value actually replaced by the xchg for the delta. */
+		old_protected = atomic_long_xchg(&c->min_usage, protected);
+		delta = protected - old_protected;
+		if (delta)
+			atomic_long_add(delta, &c->parent->children_min_usage);
+	}
+
+	protected = min(usage, READ_ONCE(c->low));
+	old_protected = atomic_long_read(&c->low_usage);
+	if (protected != old_protected) {
+		old_protected = atomic_long_xchg(&c->low_usage, protected);
+		delta = protected - old_protected;
+		if (delta)
+			atomic_long_add(delta, &c->parent->children_low_usage);
+	}
+}
+
+/**
+ *
page_counter_cancel - take pages out of the local counter
+ * @counter: counter
+ * @nr_pages: number of pages to cancel
+ */
+void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
+{
+	long new;
+
+	new = atomic_long_sub_return(nr_pages, &counter->usage);
+	/* More uncharges than charges? */
+	if (WARN_ONCE(new < 0, "page_counter underflow: %ld nr_pages=%lu\n",
+		      new, nr_pages)) {
+		new = 0;	/* clamp the counter back to zero after warning */
+		atomic_long_set(&counter->usage, new);
+	}
+	propagate_protected_usage(counter, new);
+}
+
+/**
+ * page_counter_charge - hierarchically charge pages
+ * @counter: counter
+ * @nr_pages: number of pages to charge
+ *
+ * NOTE: This does not consider any configured counter limits.
+ */
+void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
+{
+	struct page_counter *c;
+
+	for (c = counter; c; c = c->parent) {
+		long new;
+
+		new = atomic_long_add_return(nr_pages, &c->usage);
+		propagate_protected_usage(c, new);
+		/*
+		 * This is indeed racy, but we can live with some
+		 * inaccuracy in the watermark.
+		 */
+		if (new > READ_ONCE(c->watermark))
+			WRITE_ONCE(c->watermark, new);
+	}
+}
+
+/**
+ * page_counter_try_charge - try to hierarchically charge pages
+ * @counter: counter
+ * @nr_pages: number of pages to charge
+ * @fail: points to the first counter to hit its limit, if any
+ *
+ * Returns %true on success, or %false and @fail if the counter or one
+ * of its ancestors has hit its configured limit.  On failure the charges
+ * already taken on the counters below @fail are cancelled again.
+ */
+bool page_counter_try_charge(struct page_counter *counter,
+			     unsigned long nr_pages,
+			     struct page_counter **fail)
+{
+	struct page_counter *c;
+
+	for (c = counter; c; c = c->parent) {
+		long new;
+		/*
+		 * Charge speculatively to avoid an expensive CAS.  If
+		 * a bigger charge fails, it might falsely lock out a
+		 * racing smaller charge and send it into reclaim
+		 * early, but the error is limited to the difference
+		 * between the two sizes, which is less than 2M/4M in
+		 * case of a THP locking out a regular page charge.
+		 *
+		 * The atomic_long_add_return() implies a full memory
+		 * barrier between incrementing the count and reading
+		 * the limit.  When racing with page_counter_set_max(),
+		 * we either see the new limit or the setter sees the
+		 * counter has changed and retries.
+		 */
+		new = atomic_long_add_return(nr_pages, &c->usage);
+		if (new > c->max) {
+			atomic_long_sub(nr_pages, &c->usage);
+			/*
+			 * This is racy, but we can live with some
+			 * inaccuracy in the failcnt which is only used
+			 * to report stats.
+			 */
+			data_race(c->failcnt++);
+			*fail = c;
+			goto failed;
+		}
+		propagate_protected_usage(c, new);
+		/*
+		 * Just like with failcnt, we can live with some
+		 * inaccuracy in the watermark.
+		 */
+		if (new > READ_ONCE(c->watermark))
+			WRITE_ONCE(c->watermark, new);
+	}
+	return true;
+	/* Undo the charge on every counter below the one that failed. */
+failed:
+	for (c = counter; c != *fail; c = c->parent)
+		page_counter_cancel(c, nr_pages);
+
+	return false;
+}
+
+/**
+ * page_counter_uncharge - hierarchically uncharge pages
+ * @counter: counter
+ * @nr_pages: number of pages to uncharge
+ */
+void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
+{
+	struct page_counter *c;
+
+	for (c = counter; c; c = c->parent)
+		page_counter_cancel(c, nr_pages);
+}
+
+/**
+ * page_counter_set_max - set the maximum number of pages allowed
+ * @counter: counter
+ * @nr_pages: limit to set
+ *
+ * Returns 0 on success, -EBUSY if the current number of pages on the
+ * counter already exceeds the specified limit.
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
+{
+	for (;;) {
+		unsigned long old;
+		long usage;
+
+		/*
+		 * Update the limit while making sure that it's not
+		 * below the concurrently-changing counter value.
+		 *
+		 * The xchg implies two full memory barriers before
+		 * and after, so the read-swap-read is ordered and
+		 * ensures coherency with page_counter_try_charge():
+		 * that function modifies the count before checking
+		 * the limit, so if it sees the old limit, we see the
+		 * modified counter and retry.
+		 */
+		usage = page_counter_read(counter);
+
+		if (usage > nr_pages)
+			return -EBUSY;
+
+		old = xchg(&counter->max, nr_pages);
+
+		if (page_counter_read(counter) <= usage || nr_pages >= old)
+			return 0;
+
+		counter->max = old;	/* usage grew meanwhile: restore and retry */
+		cond_resched();
+	}
+}
+
+/**
+ * page_counter_set_min - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The new value is stored in @counter->min, then the protected usage of
+ * @counter and each of its ancestors is recomputed.
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
+{
+	struct page_counter *c;
+
+	WRITE_ONCE(counter->min, nr_pages);
+
+	for (c = counter; c; c = c->parent)
+		propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
+/**
+ * page_counter_set_low - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The new value is stored in @counter->low, then the protected usage of
+ * @counter and each of its ancestors is recomputed.
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
+{
+	struct page_counter *c;
+
+	WRITE_ONCE(counter->low, nr_pages);
+
+	for (c = counter; c; c = c->parent)
+		propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
+/**
+ * page_counter_memparse - memparse() for page counter limits
+ * @buf: string to parse
+ * @max: string meaning maximum possible value
+ * @nr_pages: returns the result in number of pages
+ *
+ * Returns -EINVAL, or 0 and @nr_pages on success.  @nr_pages will be
+ * limited to %PAGE_COUNTER_MAX.
+ */
+int page_counter_memparse(const char *buf, const char *max,
+			  unsigned long *nr_pages)
+{
+	char *end;
+	u64 bytes;
+
+	if (!strcmp(buf, max)) {
+		*nr_pages = PAGE_COUNTER_MAX;
+		return 0;
+	}
+
+	bytes = memparse(buf, &end);
+	if (*end != '\0')	/* trailing characters after the number */
+		return -EINVAL;
+
+	*nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
+
+	return 0;
+}
diff --git a/mm/page_ext.c b/mm/page_ext.c
new file mode 100644
index 000000000..ddf196856
--- /dev/null
+++ b/mm/page_ext.c
@@ -0,0 +1,529 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * struct page extension
+ *
+ * This is the feature to manage memory for extended data per page.
+ *
+ * Until now, we had to modify struct page itself to store extra data per
+ * page.  This requires rebuilding the kernel, which is a really
+ * time-consuming process.  And, sometimes, a rebuild is impossible due to
+ * third party module dependencies.  Lastly, enlarging struct page could
+ * cause unwanted system behaviour changes.
+ *
+ * This feature is intended to overcome the above mentioned problems. This
+ * feature allocates memory for extended data per page in a certain place
+ * rather than in the struct page itself. This memory can be accessed by the
+ * accessor functions provided by this code. During the boot process, it
+ * checks whether allocation of a huge chunk of memory is needed or not. If
+ * not, it avoids allocating memory at all. With this advantage, we can
+ * include this feature into the kernel by default and can avoid rebuilds and
+ * solve related problems.
+ *
+ * To help these things to work well, there are two callbacks for clients. One
+ * is the need callback which is mandatory if user wants to avoid useless
+ * memory allocation at boot-time. The other is optional, init callback, which
+ * is used to do proper initialization after memory is allocated.
+ *
+ * The need callback is used to decide whether extended memory allocation is
+ * needed or not. Sometimes users want to deactivate some features in this
+ * boot and extra memory would be unnecessary. In this case, to avoid
+ * allocating a huge chunk of memory, each client expresses its need for
+ * extra memory through the need callback. If one of the need callbacks
+ * returns true, it means that someone needs extra memory so that
+ * page extension core should allocate memory for page extension. If
+ * none of the need callbacks return true, memory isn't needed at all in this
+ * boot and page extension core can skip allocating memory. As a result,
+ * no memory is wasted.
+ *
+ * When need callback returns true, page_ext checks if there is a request for
+ * extra memory through size in struct page_ext_operations. If it is non-zero,
+ * extra space is allocated for each page_ext entry and offset is returned to
+ * user through offset in struct page_ext_operations.
+ *
+ * The init callback is used to do proper initialization after page extension
+ * is completely initialized. In sparse memory system, extra memory is
+ * allocated some time later than memmap is allocated. In other words, lifetime
+ * of memory for page extension isn't same with memmap for struct page.
+ * Therefore, clients can't store extra data until page extension is
+ * initialized, even if pages are allocated and used freely. This could
+ * cause inadequate state of extra data per page, so, to prevent it, client
+ * can utilize this callback to initialize the state of it correctly.
+ */
+
+#ifdef CONFIG_SPARSEMEM
+#define PAGE_EXT_INVALID       (0x1)
+#endif
+
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
+static bool need_page_idle(void)
+{
+	return true;	/* always requested in this configuration */
+}
+static struct page_ext_operations page_idle_ops __initdata = {
+	.need = need_page_idle,
+};
+#endif
+/* Clients of page_ext that are compiled in; walked by the callbacks below. */
+static struct page_ext_operations *page_ext_ops[] __initdata = {
+#ifdef CONFIG_PAGE_OWNER
+	&page_owner_ops,
+#endif
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
+	&page_idle_ops,
+#endif
+#ifdef CONFIG_PAGE_TABLE_CHECK
+	&page_table_check_ops,
+#endif
+};
+/* Size of one page_ext entry; grows as clients that need space register. */
+unsigned long page_ext_size = sizeof(struct page_ext);
+
+static unsigned long total_usage;
+static struct page_ext *lookup_page_ext(const struct page *page);
+/* Set to true by the "early_page_ext" early boot parameter. */
+bool early_page_ext;
+static int __init setup_early_page_ext(char *str)
+{
+	early_page_ext = true;
+	return 0;
+}
+early_param("early_page_ext", setup_early_page_ext);
+/*
+ * Ask every registered client whether it needs page_ext.  Each client
+ * whose need callback returns true is assigned the current page_ext_size
+ * as its offset, and page_ext_size grows by that client's size.  Returns
+ * true if at least one client needs page_ext.
+ */
+static bool __init invoke_need_callbacks(void)
+{
+	int i;
+	int entries = ARRAY_SIZE(page_ext_ops);
+	bool need = false;
+
+	for (i = 0; i < entries; i++) {
+		if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
+			page_ext_ops[i]->offset = page_ext_size;
+			page_ext_size += page_ext_ops[i]->size;
+			need = true;
+		}
+	}
+
+	return need;
+}
+/* Run the init callback of every registered client that provides one. */
+static void __init invoke_init_callbacks(void)
+{
+	int i;
+	int entries = ARRAY_SIZE(page_ext_ops);
+
+	for (i = 0; i < entries; i++) {
+		if (page_ext_ops[i]->init)
+			page_ext_ops[i]->init();
+	}
+}
+
+#ifndef CONFIG_SPARSEMEM
+void __init page_ext_init_flatmem_late(void)
+{
+	invoke_init_callbacks();
+}
+#endif
+/*
+ * Return the entry at @index in the page_ext array starting at @base,
+ * using the run-time entry size page_ext_size as the stride.
+ */
+static inline struct page_ext *get_entry(void *base, unsigned long index)
+{
+	return base + page_ext_size * index;
+}
+
+/**
+ * page_ext_get() - Get the extended information for a page.
+ * @page: The page we're interested in.
+ *
+ * Ensures that the page_ext will remain valid until page_ext_put()
+ * is called.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ * Context: Any context.
Caller may not sleep until they have called + * page_ext_put(). + */ +struct page_ext *page_ext_get(struct page *page) +{ + struct page_ext *page_ext; + + rcu_read_lock(); + page_ext = lookup_page_ext(page); + if (!page_ext) { + rcu_read_unlock(); + return NULL; + } + + return page_ext; +} + +/** + * page_ext_put() - Working with page extended information is done. + * @page_ext: Page extended information received from page_ext_get(). + * + * The page extended information of the page may not be valid after this + * function is called. + * + * Return: None. + * Context: Any context with corresponding page_ext_get() is called. + */ +void page_ext_put(struct page_ext *page_ext) +{ + if (unlikely(!page_ext)) + return; + + rcu_read_unlock(); +} +#ifndef CONFIG_SPARSEMEM + + +void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) +{ + pgdat->node_page_ext = NULL; +} + +static struct page_ext *lookup_page_ext(const struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + unsigned long index; + struct page_ext *base; + + WARN_ON_ONCE(!rcu_read_lock_held()); + base = NODE_DATA(page_to_nid(page))->node_page_ext; + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_ext arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. + */ + if (unlikely(!base)) + return NULL; + index = pfn - round_down(node_start_pfn(page_to_nid(page)), + MAX_ORDER_NR_PAGES); + return get_entry(base, index); +} + +static int __init alloc_node_page_ext(int nid) +{ + struct page_ext *base; + unsigned long table_size; + unsigned long nr_pages; + + nr_pages = NODE_DATA(nid)->node_spanned_pages; + if (!nr_pages) + return 0; + + /* + * Need extra space if node range is not aligned with + * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm + * checks buddy's status, range could be out of exact node range. 
+ */ + if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) || + !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES)) + nr_pages += MAX_ORDER_NR_PAGES; + + table_size = page_ext_size * nr_pages; + + base = memblock_alloc_try_nid( + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + MEMBLOCK_ALLOC_ACCESSIBLE, nid); + if (!base) + return -ENOMEM; + NODE_DATA(nid)->node_page_ext = base; + total_usage += table_size; + return 0; +} + +void __init page_ext_init_flatmem(void) +{ + + int nid, fail; + + if (!invoke_need_callbacks()) + return; + + for_each_online_node(nid) { + fail = alloc_node_page_ext(nid); + if (fail) + goto fail; + } + pr_info("allocated %ld bytes of page_ext\n", total_usage); + return; + +fail: + pr_crit("allocation of page_ext failed.\n"); + panic("Out of memory"); +} + +#else /* CONFIG_SPARSEMEM */ +static bool page_ext_invalid(struct page_ext *page_ext) +{ + return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID); +} + +static struct page_ext *lookup_page_ext(const struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + struct mem_section *section = __pfn_to_section(pfn); + struct page_ext *page_ext = READ_ONCE(section->page_ext); + + WARN_ON_ONCE(!rcu_read_lock_held()); + /* + * The sanity checks the page allocator does upon freeing a + * page can reach here before the page_ext arrays are + * allocated when feeding a range of pages to the allocator + * for the first time during bootup or memory hotplug. 
+ */ + if (page_ext_invalid(page_ext)) + return NULL; + return get_entry(page_ext, pfn); +} + +static void *__meminit alloc_page_ext(size_t size, int nid) +{ + gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; + void *addr = NULL; + + addr = alloc_pages_exact_nid(nid, size, flags); + if (addr) { + kmemleak_alloc(addr, size, 1, flags); + return addr; + } + + addr = vzalloc_node(size, nid); + + return addr; +} + +static int __meminit init_section_page_ext(unsigned long pfn, int nid) +{ + struct mem_section *section; + struct page_ext *base; + unsigned long table_size; + + section = __pfn_to_section(pfn); + + if (section->page_ext) + return 0; + + table_size = page_ext_size * PAGES_PER_SECTION; + base = alloc_page_ext(table_size, nid); + + /* + * The value stored in section->page_ext is (base - pfn) + * and it does not point to the memory block allocated above, + * causing kmemleak false positives. + */ + kmemleak_not_leak(base); + + if (!base) { + pr_err("page ext allocation failure\n"); + return -ENOMEM; + } + + /* + * The passed "pfn" may not be aligned to SECTION. For the calculation + * we need to apply a mask. + */ + pfn &= PAGE_SECTION_MASK; + section->page_ext = (void *)base - page_ext_size * pfn; + total_usage += table_size; + return 0; +} + +static void free_page_ext(void *addr) +{ + if (is_vmalloc_addr(addr)) { + vfree(addr); + } else { + struct page *page = virt_to_page(addr); + size_t table_size; + + table_size = page_ext_size * PAGES_PER_SECTION; + + BUG_ON(PageReserved(page)); + kmemleak_free(addr); + free_pages_exact(addr, table_size); + } +} + +static void __free_page_ext(unsigned long pfn) +{ + struct mem_section *ms; + struct page_ext *base; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_ext) + return; + + base = READ_ONCE(ms->page_ext); + /* + * page_ext here can be valid while doing the roll back + * operation in online_page_ext(). 
+ */ + if (page_ext_invalid(base)) + base = (void *)base - PAGE_EXT_INVALID; + WRITE_ONCE(ms->page_ext, NULL); + + base = get_entry(base, pfn); + free_page_ext(base); +} + +static void __invalidate_page_ext(unsigned long pfn) +{ + struct mem_section *ms; + void *val; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_ext) + return; + val = (void *)ms->page_ext + PAGE_EXT_INVALID; + WRITE_ONCE(ms->page_ext, val); +} + +static int __meminit online_page_ext(unsigned long start_pfn, + unsigned long nr_pages, + int nid) +{ + unsigned long start, end, pfn; + int fail = 0; + + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); + + if (nid == NUMA_NO_NODE) { + /* + * In this case, "nid" already exists and contains valid memory. + * "start_pfn" passed to us is a pfn which is an arg for + * online__pages(), and start_pfn should exist. + */ + nid = pfn_to_nid(start_pfn); + VM_BUG_ON(!node_online(nid)); + } + + for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) + fail = init_section_page_ext(pfn, nid); + if (!fail) + return 0; + + /* rollback */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_ext(pfn); + + return -ENOMEM; +} + +static int __meminit offline_page_ext(unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long start, end, pfn; + + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); + + /* + * Freeing of page_ext is done in 3 steps to avoid + * use-after-free of it: + * 1) Traverse all the sections and mark their page_ext + * as invalid. + * 2) Wait for all the existing users of page_ext who + * started before invalidation to finish. + * 3) Free the page_ext. 
+ */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __invalidate_page_ext(pfn); + + synchronize_rcu(); + + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_ext(pfn); + return 0; + +} + +static int __meminit page_ext_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + int ret = 0; + + switch (action) { + case MEM_GOING_ONLINE: + ret = online_page_ext(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_OFFLINE: + offline_page_ext(mn->start_pfn, + mn->nr_pages); + break; + case MEM_CANCEL_ONLINE: + offline_page_ext(mn->start_pfn, + mn->nr_pages); + break; + case MEM_GOING_OFFLINE: + break; + case MEM_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } + + return notifier_from_errno(ret); +} + +void __init page_ext_init(void) +{ + unsigned long pfn; + int nid; + + if (!invoke_need_callbacks()) + return; + + for_each_node_state(nid, N_MEMORY) { + unsigned long start_pfn, end_pfn; + + start_pfn = node_start_pfn(nid); + end_pfn = node_end_pfn(nid); + /* + * start_pfn and end_pfn may not be aligned to SECTION and the + * page->flags of out of node pages are not initialized. So we + * scan [start_pfn, the biggest section's pfn < end_pfn) here. + */ + for (pfn = start_pfn; pfn < end_pfn; + pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { + + if (!pfn_valid(pfn)) + continue; + /* + * Nodes's pfns can be overlapping. + * We know some arch can have a nodes layout such as + * -------------pfn--------------> + * N0 | N1 | N2 | N0 | N1 | N2|.... 
+ */ + if (pfn_to_nid(pfn) != nid) + continue; + if (init_section_page_ext(pfn, nid)) + goto oom; + cond_resched(); + } + } + hotplug_memory_notifier(page_ext_callback, 0); + pr_info("allocated %ld bytes of page_ext\n", total_usage); + invoke_init_callbacks(); + return; + +oom: + panic("Out of memory"); +} + +void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) +{ +} + +#endif diff --git a/mm/page_idle.c b/mm/page_idle.c new file mode 100644 index 000000000..bc08332a6 --- /dev/null +++ b/mm/page_idle.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define BITMAP_CHUNK_SIZE sizeof(u64) +#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE) + +/* + * Idle page tracking only considers user memory pages, for other types of + * pages the idle flag is always unset and an attempt to set it is silently + * ignored. + * + * We treat a page as a user memory page if it is on an LRU list, because it is + * always safe to pass such a page to rmap_walk(), which is essential for idle + * page tracking. With such an indicator of user pages we can skip isolated + * pages, but since there are not usually many of them, it will hardly affect + * the overall result. + * + * This function tries to get a user memory page by pfn as described above. 
+ */ +static struct page *page_idle_get_page(unsigned long pfn) +{ + struct page *page = pfn_to_online_page(pfn); + + if (!page || !PageLRU(page) || + !get_page_unless_zero(page)) + return NULL; + + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + return page; +} + +static bool page_idle_clear_pte_refs_one(struct folio *folio, + struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0); + bool referenced = false; + + while (page_vma_mapped_walk(&pvmw)) { + addr = pvmw.address; + if (pvmw.pte) { + /* + * For PTE-mapped THP, one sub page is referenced, + * the whole THP is referenced. + */ + if (ptep_clear_young_notify(vma, addr, pvmw.pte)) + referenced = true; + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (pmdp_clear_young_notify(vma, addr, pvmw.pmd)) + referenced = true; + } else { + /* unexpected pmd-mapped page? */ + WARN_ON_ONCE(1); + } + } + + if (referenced) { + folio_clear_idle(folio); + /* + * We cleared the referenced bit in a mapping to this page. To + * avoid interference with page reclaim, mark it young so that + * folio_referenced() will return > 0. + */ + folio_set_young(folio); + } + return true; +} + +static void page_idle_clear_pte_refs(struct page *page) +{ + struct folio *folio = page_folio(page); + + /* + * Since rwc.try_lock is unused, rwc is effectively immutable, so we + * can make it static to save some cycles and stack. 
+ */ + static struct rmap_walk_control rwc = { + .rmap_one = page_idle_clear_pte_refs_one, + .anon_lock = folio_lock_anon_vma_read, + }; + bool need_lock; + + if (!folio_mapped(folio) || !folio_raw_mapping(folio)) + return; + + need_lock = !folio_test_anon(folio) || folio_test_ksm(folio); + if (need_lock && !folio_trylock(folio)) + return; + + rmap_walk(folio, &rwc); + + if (need_lock) + folio_unlock(folio); +} + +static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t pos, size_t count) +{ + u64 *out = (u64 *)buf; + struct page *page; + unsigned long pfn, end_pfn; + int bit; + + if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE) + return -EINVAL; + + pfn = pos * BITS_PER_BYTE; + if (pfn >= max_pfn) + return 0; + + end_pfn = pfn + count * BITS_PER_BYTE; + if (end_pfn > max_pfn) + end_pfn = max_pfn; + + for (; pfn < end_pfn; pfn++) { + bit = pfn % BITMAP_CHUNK_BITS; + if (!bit) + *out = 0ULL; + page = page_idle_get_page(pfn); + if (page) { + if (page_is_idle(page)) { + /* + * The page might have been referenced via a + * pte, in which case it is not idle. Clear + * refs and recheck. 
+ */ + page_idle_clear_pte_refs(page); + if (page_is_idle(page)) + *out |= 1ULL << bit; + } + put_page(page); + } + if (bit == BITMAP_CHUNK_BITS - 1) + out++; + cond_resched(); + } + return (char *)out - buf; +} + +static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t pos, size_t count) +{ + const u64 *in = (u64 *)buf; + struct page *page; + unsigned long pfn, end_pfn; + int bit; + + if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE) + return -EINVAL; + + pfn = pos * BITS_PER_BYTE; + if (pfn >= max_pfn) + return -ENXIO; + + end_pfn = pfn + count * BITS_PER_BYTE; + if (end_pfn > max_pfn) + end_pfn = max_pfn; + + for (; pfn < end_pfn; pfn++) { + bit = pfn % BITMAP_CHUNK_BITS; + if ((*in >> bit) & 1) { + page = page_idle_get_page(pfn); + if (page) { + page_idle_clear_pte_refs(page); + set_page_idle(page); + put_page(page); + } + } + if (bit == BITMAP_CHUNK_BITS - 1) + in++; + cond_resched(); + } + return (char *)in - buf; +} + +static struct bin_attribute page_idle_bitmap_attr = + __BIN_ATTR(bitmap, 0600, + page_idle_bitmap_read, page_idle_bitmap_write, 0); + +static struct bin_attribute *page_idle_bin_attrs[] = { + &page_idle_bitmap_attr, + NULL, +}; + +static const struct attribute_group page_idle_attr_group = { + .bin_attrs = page_idle_bin_attrs, + .name = "page_idle", +}; + +static int __init page_idle_init(void) +{ + int err; + + err = sysfs_create_group(mm_kobj, &page_idle_attr_group); + if (err) { + pr_err("page_idle: register sysfs failed\n"); + return err; + } + return 0; +} +subsys_initcall(page_idle_init); diff --git a/mm/page_io.c b/mm/page_io.c new file mode 100644 index 000000000..3a5f921b9 --- /dev/null +++ b/mm/page_io.c @@ -0,0 +1,537 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/mm/page_io.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Swap reorganised 29.12.95, + * Asynchronous swapping added 30.12.95. 
Stephen Tweedie + * Removed race in async swapping. 14.4.1996. Bruno Haible + * Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie + * Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "swap.h" + +static void end_swap_bio_write(struct bio *bio) +{ + struct page *page = bio_first_page_all(bio); + + if (bio->bi_status) { + SetPageError(page); + /* + * We failed to write the page out to swap-space. + * Re-dirty the page in order to avoid it being reclaimed. + * Also print a dire warning that things will go BAD (tm) + * very quickly. + * + * Also clear PG_reclaim to avoid folio_rotate_reclaimable() + */ + set_page_dirty(page); + pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n", + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), + (unsigned long long)bio->bi_iter.bi_sector); + ClearPageReclaim(page); + } + end_page_writeback(page); + bio_put(bio); +} + +static void end_swap_bio_read(struct bio *bio) +{ + struct page *page = bio_first_page_all(bio); + struct task_struct *waiter = bio->bi_private; + + if (bio->bi_status) { + SetPageError(page); + ClearPageUptodate(page); + pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), + (unsigned long long)bio->bi_iter.bi_sector); + goto out; + } + + SetPageUptodate(page); +out: + unlock_page(page); + WRITE_ONCE(bio->bi_private, NULL); + bio_put(bio); + if (waiter) { + blk_wake_io_task(waiter); + put_task_struct(waiter); + } +} + +int generic_swapfile_activate(struct swap_info_struct *sis, + struct file *swap_file, + sector_t *span) +{ + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; + unsigned blocks_per_page; + unsigned long page_no; + unsigned blkbits; + sector_t probe_block; + sector_t last_block; + sector_t 
lowest_block = -1; + sector_t highest_block = 0; + int nr_extents = 0; + int ret; + + blkbits = inode->i_blkbits; + blocks_per_page = PAGE_SIZE >> blkbits; + + /* + * Map all the blocks into the extent tree. This code doesn't try + * to be very smart. + */ + probe_block = 0; + page_no = 0; + last_block = i_size_read(inode) >> blkbits; + while ((probe_block + blocks_per_page) <= last_block && + page_no < sis->max) { + unsigned block_in_page; + sector_t first_block; + + cond_resched(); + + first_block = probe_block; + ret = bmap(inode, &first_block); + if (ret || !first_block) + goto bad_bmap; + + /* + * It must be PAGE_SIZE aligned on-disk + */ + if (first_block & (blocks_per_page - 1)) { + probe_block++; + goto reprobe; + } + + for (block_in_page = 1; block_in_page < blocks_per_page; + block_in_page++) { + sector_t block; + + block = probe_block + block_in_page; + ret = bmap(inode, &block); + if (ret || !block) + goto bad_bmap; + + if (block != first_block + block_in_page) { + /* Discontiguity */ + probe_block++; + goto reprobe; + } + } + + first_block >>= (PAGE_SHIFT - blkbits); + if (page_no) { /* exclude the header page */ + if (first_block < lowest_block) + lowest_block = first_block; + if (first_block > highest_block) + highest_block = first_block; + } + + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, page_no, 1, first_block); + if (ret < 0) + goto out; + nr_extents += ret; + page_no++; + probe_block += blocks_per_page; +reprobe: + continue; + } + ret = nr_extents; + *span = 1 + highest_block - lowest_block; + if (page_no == 0) + page_no = 1; /* force Empty message */ + sis->max = page_no; + sis->pages = page_no - 1; + sis->highest_bit = page_no - 1; +out: + return ret; +bad_bmap: + pr_err("swapon: swapfile has holes\n"); + ret = -EINVAL; + goto out; +} + +/* + * We may have stale swap cache pages in memory: notice + * them here and get rid of the unnecessary final write. 
+ */ +int swap_writepage(struct page *page, struct writeback_control *wbc) +{ + struct folio *folio = page_folio(page); + int ret = 0; + + if (folio_free_swap(folio)) { + folio_unlock(folio); + goto out; + } + /* + * Arch code may have to preserve more data than just the page + * contents, e.g. memory tags. + */ + ret = arch_prepare_to_swap(&folio->page); + if (ret) { + folio_mark_dirty(folio); + folio_unlock(folio); + goto out; + } + if (frontswap_store(&folio->page) == 0) { + folio_start_writeback(folio); + folio_unlock(folio); + folio_end_writeback(folio); + goto out; + } + ret = __swap_writepage(&folio->page, wbc); +out: + return ret; +} + +static inline void count_swpout_vm_event(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (unlikely(PageTransHuge(page))) + count_vm_event(THP_SWPOUT); +#endif + count_vm_events(PSWPOUT, thp_nr_pages(page)); +} + +#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) +{ + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; + + memcg = page_memcg(page); + if (!memcg) + return; + + rcu_read_lock(); + css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); + bio_associate_blkg_from_css(bio, css); + rcu_read_unlock(); +} +#else +#define bio_associate_blkg_from_page(bio, page) do { } while (0) +#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */ + +struct swap_iocb { + struct kiocb iocb; + struct bio_vec bvec[SWAP_CLUSTER_MAX]; + int pages; + int len; +}; +static mempool_t *sio_pool; + +int sio_pool_init(void) +{ + if (!sio_pool) { + mempool_t *pool = mempool_create_kmalloc_pool( + SWAP_CLUSTER_MAX, sizeof(struct swap_iocb)); + if (cmpxchg(&sio_pool, NULL, pool)) + mempool_destroy(pool); + } + if (!sio_pool) + return -ENOMEM; + return 0; +} + +static void sio_write_complete(struct kiocb *iocb, long ret) +{ + struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); + struct page *page = sio->bvec[0].bv_page; + int p; + + if 
(ret != sio->len) { + /* + * In the case of swap-over-nfs, this can be a + * temporary failure if the system has limited + * memory for allocating transmit buffers. + * Mark the page dirty and avoid + * folio_rotate_reclaimable but rate-limit the + * messages but do not flag PageError like + * the normal direct-to-bio case as it could + * be temporary. + */ + pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n", + ret, page_file_offset(page)); + for (p = 0; p < sio->pages; p++) { + page = sio->bvec[p].bv_page; + set_page_dirty(page); + ClearPageReclaim(page); + } + } else { + for (p = 0; p < sio->pages; p++) + count_swpout_vm_event(sio->bvec[p].bv_page); + } + + for (p = 0; p < sio->pages; p++) + end_page_writeback(sio->bvec[p].bv_page); + + mempool_free(sio, sio_pool); +} + +static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) +{ + struct swap_iocb *sio = NULL; + struct swap_info_struct *sis = page_swap_info(page); + struct file *swap_file = sis->swap_file; + loff_t pos = page_file_offset(page); + + set_page_writeback(page); + unlock_page(page); + if (wbc->swap_plug) + sio = *wbc->swap_plug; + if (sio) { + if (sio->iocb.ki_filp != swap_file || + sio->iocb.ki_pos + sio->len != pos) { + swap_write_unplug(sio); + sio = NULL; + } + } + if (!sio) { + sio = mempool_alloc(sio_pool, GFP_NOIO); + init_sync_kiocb(&sio->iocb, swap_file); + sio->iocb.ki_complete = sio_write_complete; + sio->iocb.ki_pos = pos; + sio->pages = 0; + sio->len = 0; + } + sio->bvec[sio->pages].bv_page = page; + sio->bvec[sio->pages].bv_len = thp_size(page); + sio->bvec[sio->pages].bv_offset = 0; + sio->len += thp_size(page); + sio->pages += 1; + if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) { + swap_write_unplug(sio); + sio = NULL; + } + if (wbc->swap_plug) + *wbc->swap_plug = sio; + + return 0; +} + +int __swap_writepage(struct page *page, struct writeback_control *wbc) +{ + struct bio *bio; + int ret; + struct swap_info_struct *sis = 
page_swap_info(page); + + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + /* + * ->flags can be updated non-atomicially (scan_swap_map_slots), + * but that will never affect SWP_FS_OPS, so the data_race + * is safe. + */ + if (data_race(sis->flags & SWP_FS_OPS)) + return swap_writepage_fs(page, wbc); + + ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); + if (!ret) { + count_swpout_vm_event(page); + return 0; + } + + bio = bio_alloc(sis->bdev, 1, + REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), + GFP_NOIO); + bio->bi_iter.bi_sector = swap_page_sector(page); + bio->bi_end_io = end_swap_bio_write; + bio_add_page(bio, page, thp_size(page), 0); + + bio_associate_blkg_from_page(bio, page); + count_swpout_vm_event(page); + set_page_writeback(page); + unlock_page(page); + submit_bio(bio); + + return 0; +} + +void swap_write_unplug(struct swap_iocb *sio) +{ + struct iov_iter from; + struct address_space *mapping = sio->iocb.ki_filp->f_mapping; + int ret; + + iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len); + ret = mapping->a_ops->swap_rw(&sio->iocb, &from); + if (ret != -EIOCBQUEUED) + sio_write_complete(&sio->iocb, ret); +} + +static void sio_read_complete(struct kiocb *iocb, long ret) +{ + struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb); + int p; + + if (ret == sio->len) { + for (p = 0; p < sio->pages; p++) { + struct page *page = sio->bvec[p].bv_page; + + SetPageUptodate(page); + unlock_page(page); + } + count_vm_events(PSWPIN, sio->pages); + } else { + for (p = 0; p < sio->pages; p++) { + struct page *page = sio->bvec[p].bv_page; + + SetPageError(page); + ClearPageUptodate(page); + unlock_page(page); + } + pr_alert_ratelimited("Read-error on swap-device\n"); + } + mempool_free(sio, sio_pool); +} + +static void swap_readpage_fs(struct page *page, + struct swap_iocb **plug) +{ + struct swap_info_struct *sis = page_swap_info(page); + struct swap_iocb *sio = NULL; + loff_t pos = page_file_offset(page); + + if 
(plug) + sio = *plug; + if (sio) { + if (sio->iocb.ki_filp != sis->swap_file || + sio->iocb.ki_pos + sio->len != pos) { + swap_read_unplug(sio); + sio = NULL; + } + } + if (!sio) { + sio = mempool_alloc(sio_pool, GFP_KERNEL); + init_sync_kiocb(&sio->iocb, sis->swap_file); + sio->iocb.ki_pos = pos; + sio->iocb.ki_complete = sio_read_complete; + sio->pages = 0; + sio->len = 0; + } + sio->bvec[sio->pages].bv_page = page; + sio->bvec[sio->pages].bv_len = thp_size(page); + sio->bvec[sio->pages].bv_offset = 0; + sio->len += thp_size(page); + sio->pages += 1; + if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) { + swap_read_unplug(sio); + sio = NULL; + } + if (plug) + *plug = sio; +} + +int swap_readpage(struct page *page, bool synchronous, + struct swap_iocb **plug) +{ + struct bio *bio; + int ret = 0; + struct swap_info_struct *sis = page_swap_info(page); + bool workingset = PageWorkingset(page); + unsigned long pflags; + bool in_thrashing; + + VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageUptodate(page), page); + + /* + * Count submission time as memory stall and delay. When the device + * is congested, or the submitting cgroup IO-throttled, submission + * can be a significant part of overall IO time. 
+ */ + if (workingset) { + delayacct_thrashing_start(&in_thrashing); + psi_memstall_enter(&pflags); + } + delayacct_swapin_start(); + + if (frontswap_load(page) == 0) { + SetPageUptodate(page); + unlock_page(page); + goto out; + } + + if (data_race(sis->flags & SWP_FS_OPS)) { + swap_readpage_fs(page, plug); + goto out; + } + + if (sis->flags & SWP_SYNCHRONOUS_IO) { + ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); + if (!ret) { + count_vm_event(PSWPIN); + goto out; + } + } + + ret = 0; + bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); + bio->bi_iter.bi_sector = swap_page_sector(page); + bio->bi_end_io = end_swap_bio_read; + bio_add_page(bio, page, thp_size(page), 0); + /* + * Keep this task valid during swap readpage because the oom killer may + * attempt to access it in the page fault retry time check. + */ + if (synchronous) { + get_task_struct(current); + bio->bi_private = current; + } + count_vm_event(PSWPIN); + bio_get(bio); + submit_bio(bio); + while (synchronous) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio->bi_private)) + break; + + blk_io_schedule(); + } + __set_current_state(TASK_RUNNING); + bio_put(bio); + +out: + if (workingset) { + delayacct_thrashing_end(&in_thrashing); + psi_memstall_leave(&pflags); + } + delayacct_swapin_end(); + return ret; +} + +void __swap_read_unplug(struct swap_iocb *sio) +{ + struct iov_iter from; + struct address_space *mapping = sio->iocb.ki_filp->f_mapping; + int ret; + + iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len); + ret = mapping->a_ops->swap_rw(&sio->iocb, &from); + if (ret != -EIOCBQUEUED) + sio_read_complete(&sio->iocb, ret); +} diff --git a/mm/page_isolation.c b/mm/page_isolation.c new file mode 100644 index 000000000..47fbc1696 --- /dev/null +++ b/mm/page_isolation.c @@ -0,0 +1,671 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/mm/page_isolation.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" 
+
+#define CREATE_TRACE_POINTS
+#include /* NOTE(review): the header name is missing here, presumably lost when this patch was extracted -- restore it from upstream */
+
+/*
+ * Check whether the range [start_pfn, end_pfn) includes unmovable pages.
+ * The range must fall into a single pageblock and consequently belong to a
+ * single zone.
+ *
+ * @start_pfn:   first pfn to scan
+ * @end_pfn:     end of the range (exclusive)
+ * @migratetype: migratetype the caller isolates for; only used to decide
+ *               whether a CMA pageblock may be treated as movable
+ * @flags:       isolation flags; with MEMORY_OFFLINE, PageHWPoison() and
+ *               PageOffline() pages are skipped instead of being reported
+ *
+ * The PageLRU check without isolation or lru_lock could race so that a
+ * MIGRATE_MOVABLE block might include unmovable pages. The __PageMovable
+ * check without lock_page may also miss some movable non-lru pages under a
+ * race condition. So do not expect this function to be exact.
+ *
+ * Returns NULL if no unmovable page was found, otherwise the first page that
+ * was considered unmovable. The page is returned without holding a
+ * reference. If the caller wants to dereference that page (e.g., dumping),
+ * it has to make sure that it cannot get removed (e.g., via memory unplug)
+ * concurrently.
+ */
+static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
+				int migratetype, int flags)
+{
+	struct page *page = pfn_to_page(start_pfn);
+	struct zone *zone = page_zone(page);
+	unsigned long pfn;
+
+	VM_BUG_ON(pageblock_start_pfn(start_pfn) !=
+		  pageblock_start_pfn(end_pfn - 1));
+
+	if (is_migrate_cma_page(page)) {
+		/*
+		 * CMA allocations (alloc_contig_range) really need to
+		 * isolate CMA pageblocks even when they are not movable in
+		 * fact, so consider them movable here. For any other
+		 * requested migratetype the first page of the range is
+		 * reported as unmovable.
+		 */
+		if (is_migrate_cma(migratetype))
+			return NULL;
+
+		return page;
+	}
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		page = pfn_to_page(pfn);
+
+		/*
+		 * Both, bootmem allocations and memory holes are marked
+		 * PG_reserved and are unmovable. We can even have unmovable
+		 * allocations inside ZONE_MOVABLE, for example when
+		 * specifying "movablecore".
+		 */
+		if (PageReserved(page))
+			return page;
+
+		/*
+		 * If the zone is movable and we have ruled out all reserved
+		 * pages then it should be reasonably safe to assume the rest
+		 * is movable.
+		 */
+		if (zone_idx(zone) == ZONE_MOVABLE)
+			continue;
+
+		/*
+		 * Hugepages are not in LRU lists, but they're movable.
+		 * THPs are on the LRU, but need to be counted as #small pages.
+		 * We need not scan over tail pages because we don't
+		 * handle each tail page individually in migration.
+		 *
+		 * skip_pages is the number of pages from @page up to the end
+		 * of the compound page; one less is added to pfn because the
+		 * loop increment supplies the final step.
+		 */
+		if (PageHuge(page) || PageTransCompound(page)) {
+			struct page *head = compound_head(page);
+			unsigned int skip_pages;
+
+			if (PageHuge(page)) {
+				if (!hugepage_migration_supported(page_hstate(head)))
+					return page;
+			} else if (!PageLRU(head) && !__PageMovable(head)) {
+				return page;
+			}
+
+			skip_pages = compound_nr(head) - (page - head);
+			pfn += skip_pages - 1;
+			continue;
+		}
+
+		/*
+		 * We can't use page_count() without pinning the page
+		 * because another CPU can free the compound page.
+		 * This check already skips compound tails of THP
+		 * because their page->_refcount is zero at all times.
+		 * A free buddy page is skipped as a whole, using its order.
+		 */
+		if (!page_ref_count(page)) {
+			if (PageBuddy(page))
+				pfn += (1 << buddy_order(page)) - 1;
+			continue;
+		}
+
+		/*
+		 * The HWPoisoned page may be not in buddy system, and
+		 * page_count() is not 0.
+		 */
+		if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+			continue;
+
+		/*
+		 * We treat all PageOffline() pages as movable when offlining
+		 * to give drivers a chance to decrement their reference count
+		 * in MEM_GOING_OFFLINE in order to indicate that these pages
+		 * can be offlined as there are no direct references anymore.
+		 * For actually unmovable PageOffline() where the driver does
+		 * not support this, we will fail later when trying to actually
+		 * move these pages that still have a reference count > 0.
+		 * (false negatives in this function only)
+		 */
+		if ((flags & MEMORY_OFFLINE) && PageOffline(page))
+			continue;
+
+		if (__PageMovable(page) || PageLRU(page))
+			continue;
+
+		/*
+		 * If there are RECLAIMABLE pages, we need to check
+		 * them. But for now, memory offline itself doesn't call
+		 * shrink_node_slabs(), and that still has to be fixed.
+		 * Everything that reaches this point is reported as
+		 * unmovable.
+		 */
+		return page;
+	}
+	return NULL;
+}
+
+/*
+ * Set the pageblock migratetype to MIGRATE_ISOLATE if no unmovable page is
+ * present in [start_pfn, end_pfn).
+ * The pageblock must intersect with
+ * [start_pfn, end_pfn).
+ *
+ * @page:        page inside the pageblock to isolate
+ * @migratetype: passed on to has_unmovable_pages()
+ * @isol_flags:  isolation flags, passed on to has_unmovable_pages(); with
+ *               REPORT_FAILURE the offending page is dumped on failure
+ * @start_pfn:   start of the range being isolated
+ * @end_pfn:     end of the range being isolated (exclusive)
+ *
+ * Takes zone->lock with interrupts disabled for the check and the update.
+ *
+ * Returns 0 on success, -EBUSY if the pageblock is already isolated or an
+ * unmovable page was found.
+ */
+static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
+			unsigned long start_pfn, unsigned long end_pfn)
+{
+	struct zone *zone = page_zone(page);
+	struct page *unmovable;
+	unsigned long flags;
+	unsigned long check_unmovable_start, check_unmovable_end;
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	/*
+	 * We assume the caller intended to SET migrate type to isolate.
+	 * If it is already set, then someone else must have raced and
+	 * set it before us.
+	 */
+	if (is_migrate_isolate_page(page)) {
+		spin_unlock_irqrestore(&zone->lock, flags);
+		return -EBUSY;
+	}
+
+	/*
+	 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+	 * We just check MOVABLE pages.
+	 *
+	 * Pass the intersection of [start_pfn, end_pfn) and the page's pageblock
+	 * to avoid redundant checks.
+	 */
+	check_unmovable_start = max(page_to_pfn(page), start_pfn);
+	check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)),
+				  end_pfn);
+
+	unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
+			migratetype, isol_flags);
+	if (!unmovable) {
+		unsigned long nr_pages;
+		/* remember the old type: the free page counters are adjusted for it */
+		int mt = get_pageblock_migratetype(page);
+
+		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+		zone->nr_isolate_pageblock++;
+		nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
+									NULL);
+
+		__mod_zone_freepage_state(zone, -nr_pages, mt);
+		spin_unlock_irqrestore(&zone->lock, flags);
+		return 0;
+	}
+
+	spin_unlock_irqrestore(&zone->lock, flags);
+	if (isol_flags & REPORT_FAILURE) {
+		/*
+		 * printk() with zone->lock held will likely trigger a
+		 * lockdep splat, so defer it here.
+		 */
+		dump_page(unmovable, "unmovable page");
+	}
+
+	return -EBUSY;
+}
+
+/*
+ * Switch the pageblock containing @page from MIGRATE_ISOLATE back to
+ * @migratetype. Does nothing if the pageblock is not isolated. The whole
+ * update runs under zone->lock with interrupts disabled.
+ */
+static void unset_migratetype_isolate(struct page *page, int migratetype)
+{
+	struct zone *zone;
+	unsigned long flags, nr_pages;
+	bool isolated_page = false;
+	unsigned int order;
+	struct page *buddy;
+
+	zone = page_zone(page);
+	spin_lock_irqsave(&zone->lock, flags);
+	if (!is_migrate_isolate_page(page))
+		goto out;
+
+	/*
+	 * Because a free page of pageblock_order or more on an isolated
+	 * pageblock is restricted from merging due to the freepage counting
+	 * problem, it is possible that there is a free buddy page.
+	 * move_freepages_block() doesn't take care of merging, so we need
+	 * another approach in order to merge them. Isolating the page and
+	 * freeing it again will get these pages merged.
+	 */
+	if (PageBuddy(page)) {
+		order = buddy_order(page);
+		if (order >= pageblock_order && order < MAX_ORDER - 1) {
+			buddy = find_buddy_page_pfn(page, page_to_pfn(page),
+						    order, NULL);
+			if (buddy && !is_migrate_isolate_page(buddy)) {
+				isolated_page = !!__isolate_free_page(page, order);
+				/*
+				 * Isolating a free page in an isolated pageblock
+				 * is expected to always work as watermarks don't
+				 * apply here.
+				 */
+				VM_WARN_ON(!isolated_page);
+			}
+		}
+	}
+
+	/*
+	 * If we isolate freepage with more than pageblock_order, there
+	 * should be no freepage in the range, so we could avoid costly
+	 * pageblock scanning for freepage moving.
+	 *
+	 * We didn't actually touch any of the isolated pages, so place them
+	 * to the tail of the freelist. This is an optimization for memory
+	 * onlining - just onlined memory won't immediately be considered for
+	 * allocation.
+	 */
+	if (!isolated_page) {
+		nr_pages = move_freepages_block(zone, page, migratetype, NULL);
+		__mod_zone_freepage_state(zone, nr_pages, migratetype);
+	}
+	set_pageblock_migratetype(page, migratetype);
+	/* order is only read here, i.e. only after it was set above */
+	if (isolated_page)
+		__putback_isolated_page(page, order, migratetype);
+	zone->nr_isolate_pageblock--;
+out:
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Return the first page in [pfn, pfn + nr_pages) for which
+ * pfn_to_online_page() succeeds, or NULL if there is none.
+ */
+static inline struct page *
+__first_valid_page(unsigned long pfn, unsigned long nr_pages)
+{
+	/* same type as nr_pages: no signed/unsigned compare, no int overflow */
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page;
+
+		page = pfn_to_online_page(pfn + i);
+		if (!page)
+			continue;
+		return page;
+	}
+	return NULL;
+}
+
+/**
+ * isolate_single_pageblock() -- tries to isolate a pageblock that might be
+ * within a free or in-use page.
+ * @boundary_pfn:		pageblock-aligned pfn that a page might cross
+ * @flags:			isolation flags
+ * @gfp_flags:			GFP flags used for migrating pages
+ * @isolate_before:	isolate the pageblock before the boundary_pfn
+ * @skip_isolation:	the flag to skip the pageblock isolation in second
+ *			isolate_single_pageblock()
+ * @migratetype:	migrate type to set in error recovery.
+ *
+ * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
+ * pageblock. When not all pageblocks within a page are isolated at the same
+ * time, free page accounting can go wrong. For example, in the case of
+ * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pageblocks.
+ * [         MAX_ORDER-1         ]
+ * [  pageblock0  |  pageblock1  ]
+ * When either pageblock is isolated, if it is a free page, the page is not
+ * split into separate migratetype lists, which it is supposed to be; if it is
+ * an in-use page and freed later, __free_one_page() does not split the free
+ * page either. The function handles this by splitting the free page or
+ * migrating the in-use page then splitting the free page.
+ */
+static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
+			gfp_t gfp_flags, bool isolate_before, bool skip_isolation,
+			int migratetype)
+{
+	unsigned long start_pfn;
+	unsigned long isolate_pageblock;
+	unsigned long pfn;
+	struct zone *zone;
+	int ret;
+
+	VM_BUG_ON(!pageblock_aligned(boundary_pfn));
+
+	if (isolate_before)
+		isolate_pageblock = boundary_pfn - pageblock_nr_pages;
+	else
+		isolate_pageblock = boundary_pfn;
+
+	/*
+	 * Start scanning at the beginning of the MAX_ORDER_NR_PAGES aligned
+	 * range to avoid only isolating a subset of pageblocks from a bigger
+	 * than pageblock free or in-use page. Also make sure all
+	 * to-be-isolated pageblocks are within the same zone, by not
+	 * starting below the zone's first pfn.
+	 */
+	zone = page_zone(pfn_to_page(isolate_pageblock));
+	start_pfn = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
+				      zone->zone_start_pfn);
+
+	if (skip_isolation) {
+		int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+
+		VM_BUG_ON(!is_migrate_isolate(mt)); /* the caller must have isolated it already */
+	} else {
+		ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype,
+				flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+
+		if (ret)
+			return ret;
+	}
+
+	/*
+	 * Bail out early when the to-be-isolated pageblock does not form
+	 * a free or in-use page across boundary_pfn:
+	 *
+	 * 1. isolate before boundary_pfn: the page after is not online
+	 * 2. isolate after boundary_pfn: the page before is not online
+	 *
+	 * This also ensures correctness. Without it, when isolating after
+	 * boundary_pfn and [start_pfn, boundary_pfn) is not online,
+	 * __first_valid_page() would return an unexpected NULL in the for
+	 * loop below.
+	 */
+	if (isolate_before) {
+		if (!pfn_to_online_page(boundary_pfn))
+			return 0;
+	} else {
+		if (!pfn_to_online_page(boundary_pfn - 1))
+			return 0;
+	}
+
+	for (pfn = start_pfn; pfn < boundary_pfn;) {
+		struct page *page = __first_valid_page(pfn, boundary_pfn - pfn);
+
+		VM_BUG_ON(!page);
+		pfn = page_to_pfn(page);
+		/*
+		 * start_pfn is MAX_ORDER_NR_PAGES aligned, so if there are any
+		 * free pages in [start_pfn, boundary_pfn), their head page
+		 * will always be in the range.
+		 *
+		 * A free page that reaches past boundary_pfn is split at the
+		 * boundary. If split_free_page() returns non-zero, the same
+		 * pfn is looked at again in the next iteration.
+		 */
+		if (PageBuddy(page)) {
+			int order = buddy_order(page);
+
+			if (pfn + (1UL << order) > boundary_pfn) {
+				/* free page changed before split, check it again */
+				if (split_free_page(page, order, boundary_pfn - pfn))
+					continue;
+			}
+
+			pfn += 1UL << order;
+			continue;
+		}
+		/*
+		 * Migrate compound pages, then let the free page handling code
+		 * above do the rest. If migration is not possible, just fail.
+		 * Compound pages that end at or before boundary_pfn do not
+		 * cross the boundary and are simply stepped over.
+		 */
+		if (PageCompound(page)) {
+			struct page *head = compound_head(page);
+			unsigned long head_pfn = page_to_pfn(head);
+			unsigned long nr_pages = compound_nr(head);
+
+			if (head_pfn + nr_pages <= boundary_pfn) {
+				pfn = head_pfn + nr_pages;
+				continue;
+			}
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+			/*
+			 * hugetlb, lru compound (THP), and movable compound pages
+			 * can be migrated. Otherwise, fail the isolation.
+			 */
+			if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
+				int order;
+				unsigned long outer_pfn;
+				int page_mt = get_pageblock_migratetype(page);
+				bool isolate_page = !is_migrate_isolate_page(page);
+				struct compact_control cc = {
+					.nr_migratepages = 0,
+					.order = -1,
+					.zone = page_zone(pfn_to_page(head_pfn)),
+					.mode = MIGRATE_SYNC,
+					.ignore_skip_hint = true,
+					.no_set_skip_hint = true,
+					.gfp_mask = gfp_flags,
+					.alloc_contig = true,
+				};
+				INIT_LIST_HEAD(&cc.migratepages);
+
+				/*
+				 * XXX: mark the page as MIGRATE_ISOLATE so that
+				 * no one else can grab the freed page after migration.
+				 * Ideally, the page should be freed as two separate
+				 * pages to be added into separate migratetype free
+				 * lists.
+				 *
+				 * This is only done when the page's pageblock is
+				 * not isolated yet (isolate_page).
+				 */
+				if (isolate_page) {
+					ret = set_migratetype_isolate(page, page_mt,
+						flags, head_pfn, head_pfn + nr_pages);
+					if (ret)
+						goto failed;
+				}
+
+				ret = __alloc_contig_migrate_range(&cc, head_pfn,
+							head_pfn + nr_pages);
+
+				/*
+				 * restore the page's migratetype so that it can
+				 * be split into separate migratetype free lists
+				 * later. This happens before the result of the
+				 * migration is checked, so it is also undone on
+				 * failure.
+				 */
+				if (isolate_page)
+					unset_migratetype_isolate(page, page_mt);
+
+				if (ret)
+					goto failed;
+				/*
+				 * reset pfn to the head of the free page, so
+				 * that the free page handling code above can split
+				 * the free page to the right migratetype list.
+				 *
+				 * head_pfn is not used here as a hugetlb page order
+				 * can be bigger than MAX_ORDER-1, but after it is
+				 * freed, the free page order is not. Use pfn within
+				 * the range to find the head of the free page.
+				 *
+				 * Each round clears one more low bit of outer_pfn,
+				 * i.e. aligns it down to the next bigger order,
+				 * until a PageBuddy page is found.
+				 */
+				order = 0;
+				outer_pfn = pfn;
+				while (!PageBuddy(pfn_to_page(outer_pfn))) {
+					/* stop if we cannot find the free page */
+					if (++order >= MAX_ORDER)
+						goto failed;
+					outer_pfn &= ~0UL << order;
+				}
+				pfn = outer_pfn;
+				continue;
+			} else
+#endif /* CONFIG_COMPACTION || CONFIG_CMA */
+				goto failed;
+		}
+
+		pfn++;
+	}
+	return 0;
+failed:
+	/* restore the original migratetype, but only if this call isolated the pageblock */
+	if (!skip_isolation)
+		unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype);
+	return -EBUSY;
+}
+
+/**
+ * start_isolate_page_range() - make page-allocation-type of range of pages to
+ * be MIGRATE_ISOLATE.
+ * @start_pfn:		The lower PFN of the range to be isolated.
+ * @end_pfn:		The upper PFN of the range to be isolated.
+ * @migratetype:	Migrate type to set in error recovery.
+ * @flags:		The following flags are allowed (they can be combined in
+ *			a bit mask)
+ *			MEMORY_OFFLINE - isolate to offline (!allocate) memory
+ *					 e.g., skip over PageHWPoison() pages
+ *					 and PageOffline() pages.
+ *			REPORT_FAILURE - report details about the failure to
+ *			isolate the range
+ * @gfp_flags:		GFP flags used for migrating pages that sit across the
+ *			range boundaries.
+ *
+ * Making the page-allocation-type MIGRATE_ISOLATE means free pages in
+ * the range will never be allocated. Any free pages and pages freed in the
+ * future will not be allocated again. If the specified range includes migrate
+ * types other than MOVABLE or CMA, this will fail with -EBUSY. To finally
+ * isolate all pages in the range, the caller has to free all pages in the
+ * range. test_pages_isolated() can be used to test it.
+ *
+ * The function first tries to isolate the pageblocks at the beginning and end
+ * of the range, since there might be pages across the range boundaries.
+ * Afterwards, it isolates the rest of the range.
+ *
+ * There is no high level synchronization mechanism that prevents two threads
+ * from trying to isolate overlapping ranges. If this happens, one thread
+ * will notice pageblocks in the overlapping range already set to isolate.
+ * This happens in set_migratetype_isolate, and set_migratetype_isolate
+ * returns an error. We then clean up by restoring the migration type on
+ * pageblocks we may have modified and return -EBUSY to caller. This
+ * prevents two threads from simultaneously working on overlapping ranges.
+ *
+ * Please note that there is no strong synchronization with the page allocator
+ * either. Pages might be freed while their page blocks are marked ISOLATED.
+ * A call to drain_all_pages() after isolation can flush most of them. However
+ * in some cases pages might still end up on pcp lists and that would allow
+ * for their allocation even when they are in fact isolated already. Depending
+ * on how strong of a guarantee the caller needs, zone_pcp_disable/enable()
+ * might be used to flush and disable pcplist before isolation and enable after
+ * unisolation.
+ *
+ * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
+ */
+int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+			     int migratetype, int flags, gfp_t gfp_flags)
+{
+	unsigned long pfn;
+	struct page *page;
+	/*
+	 * isolation is done at page block granularity, so the work below is
+	 * done on the pageblock range [isolate_start, isolate_end) derived
+	 * from [start_pfn, end_pfn)
+	 */
+	unsigned long isolate_start = pageblock_start_pfn(start_pfn);
+	unsigned long isolate_end = pageblock_align(end_pfn);
+	int ret;
+	bool skip_isolation = false;
+
+	/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
+	ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false,
+			skip_isolation, migratetype);
+	if (ret)
+		return ret;
+
+	if (isolate_start == isolate_end - pageblock_nr_pages)
+		skip_isolation = true; /* single pageblock: already isolated by the call above */
+
+	/*
+	 * isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock;
+	 * on failure, undo the isolation of the first pageblock done above
+	 */
+	ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true,
+			skip_isolation, migratetype);
+	if (ret) {
+		unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
+		return ret;
+	}
+
+	/*
+	 * skip isolated pageblocks at the beginning and end
+	 *
+	 * Pageblocks without any online page are left alone. If one of the
+	 * remaining pageblocks cannot be isolated, undo the pageblocks
+	 * handled so far in [isolate_start, pfn) as well as the last
+	 * pageblock, which was isolated separately above.
+	 */
+	for (pfn = isolate_start + pageblock_nr_pages;
+	     pfn < isolate_end - pageblock_nr_pages;
+	     pfn += pageblock_nr_pages) {
+		page = __first_valid_page(pfn, pageblock_nr_pages);
+		if (page && set_migratetype_isolate(page, migratetype, flags,
+					start_pfn, end_pfn)) {
+			undo_isolate_page_range(isolate_start, pfn, migratetype);
+			unset_migratetype_isolate(
+				pfn_to_page(isolate_end - pageblock_nr_pages),
+				migratetype);
+			return -EBUSY;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Make isolated pages available again.
+ */ +void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, + int migratetype) +{ + unsigned long pfn; + struct page *page; + unsigned long isolate_start = pageblock_start_pfn(start_pfn); + unsigned long isolate_end = pageblock_align(end_pfn); + + for (pfn = isolate_start; + pfn < isolate_end; + pfn += pageblock_nr_pages) { + page = __first_valid_page(pfn, pageblock_nr_pages); + if (!page || !is_migrate_isolate_page(page)) + continue; + unset_migratetype_isolate(page, migratetype); + } +} +/* + * Test all pages in the range is free(means isolated) or not. + * all pages in [start_pfn...end_pfn) must be in the same zone. + * zone->lock must be held before call this. + * + * Returns the last tested pfn. + */ +static unsigned long +__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, + int flags) +{ + struct page *page; + + while (pfn < end_pfn) { + page = pfn_to_page(pfn); + if (PageBuddy(page)) + /* + * If the page is on a free list, it has to be on + * the correct MIGRATE_ISOLATE freelist. There is no + * simple way to verify that as VM_BUG_ON(), though. + */ + pfn += 1 << buddy_order(page); + else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) + /* A HWPoisoned page cannot be also PageBuddy */ + pfn++; + else if ((flags & MEMORY_OFFLINE) && PageOffline(page) && + !page_count(page)) + /* + * The responsible driver agreed to skip PageOffline() + * pages when offlining memory by dropping its + * reference in MEM_GOING_OFFLINE. + */ + pfn++; + else + break; + } + + return pfn; +} + +/* Caller should ensure that requested range is in a single zone */ +int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, + int isol_flags) +{ + unsigned long pfn, flags; + struct page *page; + struct zone *zone; + int ret; + + /* + * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages + * are not aligned to pageblock_nr_pages. + * Then we just check migratetype first. 
+ */ + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + page = __first_valid_page(pfn, pageblock_nr_pages); + if (page && !is_migrate_isolate_page(page)) + break; + } + page = __first_valid_page(start_pfn, end_pfn - start_pfn); + if ((pfn < end_pfn) || !page) { + ret = -EBUSY; + goto out; + } + + /* Check all pages are free or marked as ISOLATED */ + zone = page_zone(page); + spin_lock_irqsave(&zone->lock, flags); + pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags); + spin_unlock_irqrestore(&zone->lock, flags); + + ret = pfn < end_pfn ? -EBUSY : 0; + +out: + trace_test_pages_isolated(start_pfn, end_pfn, pfn); + + return ret; +} diff --git a/mm/page_owner.c b/mm/page_owner.c new file mode 100644 index 000000000..2d27f532d --- /dev/null +++ b/mm/page_owner.c @@ -0,0 +1,726 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack) + * to use off stack temporal storage + */ +#define PAGE_OWNER_STACK_DEPTH (16) + +struct page_owner { + unsigned short order; + short last_migrate_reason; + gfp_t gfp_mask; + depot_stack_handle_t handle; + depot_stack_handle_t free_handle; + u64 ts_nsec; + u64 free_ts_nsec; + char comm[TASK_COMM_LEN]; + pid_t pid; + pid_t tgid; +}; + +static bool page_owner_enabled __initdata; +DEFINE_STATIC_KEY_FALSE(page_owner_inited); + +static depot_stack_handle_t dummy_handle; +static depot_stack_handle_t failure_handle; +static depot_stack_handle_t early_handle; + +static void init_early_allocated_pages(void); + +static int __init early_page_owner_param(char *buf) +{ + int ret = kstrtobool(buf, &page_owner_enabled); + + if (page_owner_enabled) + stack_depot_want_early_init(); + + return ret; +} +early_param("page_owner", early_page_owner_param); + +static __init bool need_page_owner(void) +{ + 
return page_owner_enabled; +} + +static __always_inline depot_stack_handle_t create_dummy_stack(void) +{ + unsigned long entries[4]; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); + return stack_depot_save(entries, nr_entries, GFP_KERNEL); +} + +static noinline void register_dummy_stack(void) +{ + dummy_handle = create_dummy_stack(); +} + +static noinline void register_failure_stack(void) +{ + failure_handle = create_dummy_stack(); +} + +static noinline void register_early_stack(void) +{ + early_handle = create_dummy_stack(); +} + +static __init void init_page_owner(void) +{ + if (!page_owner_enabled) + return; + + register_dummy_stack(); + register_failure_stack(); + register_early_stack(); + static_branch_enable(&page_owner_inited); + init_early_allocated_pages(); +} + +struct page_ext_operations page_owner_ops = { + .size = sizeof(struct page_owner), + .need = need_page_owner, + .init = init_page_owner, +}; + +static inline struct page_owner *get_page_owner(struct page_ext *page_ext) +{ + return (void *)page_ext + page_owner_ops.offset; +} + +static noinline depot_stack_handle_t save_stack(gfp_t flags) +{ + unsigned long entries[PAGE_OWNER_STACK_DEPTH]; + depot_stack_handle_t handle; + unsigned int nr_entries; + + /* + * Avoid recursion. 
+ * + * Sometimes page metadata allocation tracking requires more + * memory to be allocated: + * - when new stack trace is saved to stack depot + * - when backtrace itself is calculated (ia64) + */ + if (current->in_page_owner) + return dummy_handle; + current->in_page_owner = 1; + + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); + handle = stack_depot_save(entries, nr_entries, flags); + if (!handle) + handle = failure_handle; + + current->in_page_owner = 0; + return handle; +} + +void __reset_page_owner(struct page *page, unsigned short order) +{ + int i; + struct page_ext *page_ext; + depot_stack_handle_t handle; + struct page_owner *page_owner; + u64 free_ts_nsec = local_clock(); + + page_ext = page_ext_get(page); + if (unlikely(!page_ext)) + return; + + handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); + for (i = 0; i < (1 << order); i++) { + __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); + page_owner = get_page_owner(page_ext); + page_owner->free_handle = handle; + page_owner->free_ts_nsec = free_ts_nsec; + page_ext = page_ext_next(page_ext); + } + page_ext_put(page_ext); +} + +static inline void __set_page_owner_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, + unsigned short order, gfp_t gfp_mask) +{ + struct page_owner *page_owner; + int i; + + for (i = 0; i < (1 << order); i++) { + page_owner = get_page_owner(page_ext); + page_owner->handle = handle; + page_owner->order = order; + page_owner->gfp_mask = gfp_mask; + page_owner->last_migrate_reason = -1; + page_owner->pid = current->pid; + page_owner->tgid = current->tgid; + page_owner->ts_nsec = local_clock(); + strscpy(page_owner->comm, current->comm, + sizeof(page_owner->comm)); + __set_bit(PAGE_EXT_OWNER, &page_ext->flags); + __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); + + page_ext = page_ext_next(page_ext); + } +} + +noinline void __set_page_owner(struct page *page, unsigned short order, + gfp_t gfp_mask) +{ + struct page_ext *page_ext; + 
depot_stack_handle_t handle; + + handle = save_stack(gfp_mask); + + page_ext = page_ext_get(page); + if (unlikely(!page_ext)) + return; + __set_page_owner_handle(page_ext, handle, order, gfp_mask); + page_ext_put(page_ext); +} + +void __set_page_owner_migrate_reason(struct page *page, int reason) +{ + struct page_ext *page_ext = page_ext_get(page); + struct page_owner *page_owner; + + if (unlikely(!page_ext)) + return; + + page_owner = get_page_owner(page_ext); + page_owner->last_migrate_reason = reason; + page_ext_put(page_ext); +} + +void __split_page_owner(struct page *page, unsigned int nr) +{ + int i; + struct page_ext *page_ext = page_ext_get(page); + struct page_owner *page_owner; + + if (unlikely(!page_ext)) + return; + + for (i = 0; i < nr; i++) { + page_owner = get_page_owner(page_ext); + page_owner->order = 0; + page_ext = page_ext_next(page_ext); + } + page_ext_put(page_ext); +} + +void __folio_copy_owner(struct folio *newfolio, struct folio *old) +{ + struct page_ext *old_ext; + struct page_ext *new_ext; + struct page_owner *old_page_owner, *new_page_owner; + + old_ext = page_ext_get(&old->page); + if (unlikely(!old_ext)) + return; + + new_ext = page_ext_get(&newfolio->page); + if (unlikely(!new_ext)) { + page_ext_put(old_ext); + return; + } + + old_page_owner = get_page_owner(old_ext); + new_page_owner = get_page_owner(new_ext); + new_page_owner->order = old_page_owner->order; + new_page_owner->gfp_mask = old_page_owner->gfp_mask; + new_page_owner->last_migrate_reason = + old_page_owner->last_migrate_reason; + new_page_owner->handle = old_page_owner->handle; + new_page_owner->pid = old_page_owner->pid; + new_page_owner->tgid = old_page_owner->tgid; + new_page_owner->ts_nsec = old_page_owner->ts_nsec; + new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; + strcpy(new_page_owner->comm, old_page_owner->comm); + + /* + * We don't clear the bit on the old folio as it's going to be freed + * after migration. 
Until then, the info can be useful in case of + * a bug, and the overall stats will be off a bit only temporarily. + * Also, migrate_misplaced_transhuge_page() can still fail the + * migration and then we want the old folio to retain the info. But + * in that case we also don't need to explicitly clear the info from + * the new page, which will be freed. + */ + __set_bit(PAGE_EXT_OWNER, &new_ext->flags); + __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); + page_ext_put(new_ext); + page_ext_put(old_ext); +} + +void pagetypeinfo_showmixedcount_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + struct page *page; + struct page_ext *page_ext; + struct page_owner *page_owner; + unsigned long pfn, block_end_pfn; + unsigned long end_pfn = zone_end_pfn(zone); + unsigned long count[MIGRATE_TYPES] = { 0, }; + int pageblock_mt, page_mt; + int i; + + /* Scan block by block. First and last block may be incomplete */ + pfn = zone->zone_start_pfn; + + /* + * Walk the zone in pageblock_nr_pages steps. If a page block spans + * a zone boundary, it will be double counted between zones. This does + * not matter as the mixed block count will still be correct + */ + for (; pfn < end_pfn; ) { + page = pfn_to_online_page(pfn); + if (!page) { + pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + continue; + } + + block_end_pfn = pageblock_end_pfn(pfn); + block_end_pfn = min(block_end_pfn, end_pfn); + + pageblock_mt = get_pageblock_migratetype(page); + + for (; pfn < block_end_pfn; pfn++) { + /* The pageblock is online, no need to recheck. 
*/ + page = pfn_to_page(pfn); + + if (page_zone(page) != zone) + continue; + + if (PageBuddy(page)) { + unsigned long freepage_order; + + freepage_order = buddy_order_unsafe(page); + if (freepage_order < MAX_ORDER) + pfn += (1UL << freepage_order) - 1; + continue; + } + + if (PageReserved(page)) + continue; + + page_ext = page_ext_get(page); + if (unlikely(!page_ext)) + continue; + + if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) + goto ext_put_continue; + + page_owner = get_page_owner(page_ext); + page_mt = gfp_migratetype(page_owner->gfp_mask); + if (pageblock_mt != page_mt) { + if (is_migrate_cma(pageblock_mt)) + count[MIGRATE_MOVABLE]++; + else + count[pageblock_mt]++; + + pfn = block_end_pfn; + page_ext_put(page_ext); + break; + } + pfn += (1UL << page_owner->order) - 1; +ext_put_continue: + page_ext_put(page_ext); + } + } + + /* Print counts */ + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (i = 0; i < MIGRATE_TYPES; i++) + seq_printf(m, "%12lu ", count[i]); + seq_putc(m, '\n'); +} + +/* + * Looking for memcg information and print it out + */ +static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, + struct page *page) +{ +#ifdef CONFIG_MEMCG + unsigned long memcg_data; + struct mem_cgroup *memcg; + bool online; + char name[80]; + + rcu_read_lock(); + memcg_data = READ_ONCE(page->memcg_data); + if (!memcg_data) + goto out_unlock; + + if (memcg_data & MEMCG_DATA_OBJCGS) + ret += scnprintf(kbuf + ret, count - ret, + "Slab cache page\n"); + + memcg = page_memcg_check(page); + if (!memcg) + goto out_unlock; + + online = (memcg->css.flags & CSS_ONLINE); + cgroup_name(memcg->css.cgroup, name, sizeof(name)); + ret += scnprintf(kbuf + ret, count - ret, + "Charged %sto %smemcg %s\n", + PageMemcgKmem(page) ? "(via objcg) " : "", + online ? 
"" : "offline ", + name); +out_unlock: + rcu_read_unlock(); +#endif /* CONFIG_MEMCG */ + + return ret; +} + +static ssize_t +print_page_owner(char __user *buf, size_t count, unsigned long pfn, + struct page *page, struct page_owner *page_owner, + depot_stack_handle_t handle) +{ + int ret, pageblock_mt, page_mt; + char *kbuf; + + count = min_t(size_t, count, PAGE_SIZE); + kbuf = kmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = scnprintf(kbuf, count, + "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns, free_ts %llu ns\n", + page_owner->order, page_owner->gfp_mask, + &page_owner->gfp_mask, page_owner->pid, + page_owner->tgid, page_owner->comm, + page_owner->ts_nsec, page_owner->free_ts_nsec); + + /* Print information relevant to grouping pages by mobility */ + pageblock_mt = get_pageblock_migratetype(page); + page_mt = gfp_migratetype(page_owner->gfp_mask); + ret += scnprintf(kbuf + ret, count - ret, + "PFN %lu type %s Block %lu type %s Flags %pGp\n", + pfn, + migratetype_names[page_mt], + pfn >> pageblock_order, + migratetype_names[pageblock_mt], + &page->flags); + + ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0); + if (ret >= count) + goto err; + + if (page_owner->last_migrate_reason != -1) { + ret += scnprintf(kbuf + ret, count - ret, + "Page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_owner->last_migrate_reason]); + } + + ret = print_page_owner_memcg(kbuf, count, ret, page); + + ret += snprintf(kbuf + ret, count - ret, "\n"); + if (ret >= count) + goto err; + + if (copy_to_user(buf, kbuf, ret)) + ret = -EFAULT; + + kfree(kbuf); + return ret; + +err: + kfree(kbuf); + return -ENOMEM; +} + +void __dump_page_owner(const struct page *page) +{ + struct page_ext *page_ext = page_ext_get((void *)page); + struct page_owner *page_owner; + depot_stack_handle_t handle; + gfp_t gfp_mask; + int mt; + + if (unlikely(!page_ext)) { + pr_alert("There is not page extension available.\n"); 
+ return; + } + + page_owner = get_page_owner(page_ext); + gfp_mask = page_owner->gfp_mask; + mt = gfp_migratetype(gfp_mask); + + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { + pr_alert("page_owner info is not present (never set?)\n"); + page_ext_put(page_ext); + return; + } + + if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) + pr_alert("page_owner tracks the page as allocated\n"); + else + pr_alert("page_owner tracks the page as freed\n"); + + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n", + page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, + page_owner->pid, page_owner->tgid, page_owner->comm, + page_owner->ts_nsec, page_owner->free_ts_nsec); + + handle = READ_ONCE(page_owner->handle); + if (!handle) + pr_alert("page_owner allocation stack trace missing\n"); + else + stack_depot_print(handle); + + handle = READ_ONCE(page_owner->free_handle); + if (!handle) { + pr_alert("page_owner free stack trace missing\n"); + } else { + pr_alert("page last free stack trace:\n"); + stack_depot_print(handle); + } + + if (page_owner->last_migrate_reason != -1) + pr_alert("page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_owner->last_migrate_reason]); + page_ext_put(page_ext); +} + +static ssize_t +read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + unsigned long pfn; + struct page *page; + struct page_ext *page_ext; + struct page_owner *page_owner; + depot_stack_handle_t handle; + + if (!static_branch_unlikely(&page_owner_inited)) + return -EINVAL; + + page = NULL; + if (*ppos == 0) + pfn = min_low_pfn; + else + pfn = *ppos; + /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ + while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) + pfn++; + + /* Find an allocated page */ + for (; pfn < max_pfn; pfn++) { + /* + * This temporary page_owner is required so + * that we can avoid the 
context switches while holding + * the rcu lock and copying the page owner information to + * user through copy_to_user() or GFP_KERNEL allocations. + */ + struct page_owner page_owner_tmp; + + /* + * If the new page is in a new MAX_ORDER_NR_PAGES area, + * validate the area as existing, skip it if not + */ + if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0 && !pfn_valid(pfn)) { + pfn += MAX_ORDER_NR_PAGES - 1; + continue; + } + + page = pfn_to_page(pfn); + if (PageBuddy(page)) { + unsigned long freepage_order = buddy_order_unsafe(page); + + if (freepage_order < MAX_ORDER) + pfn += (1UL << freepage_order) - 1; + continue; + } + + page_ext = page_ext_get(page); + if (unlikely(!page_ext)) + continue; + + /* + * Some pages could be missed by concurrent allocation or free, + * because we don't hold the zone lock. + */ + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + goto ext_put_continue; + + /* + * Although we do have the info about past allocation of free + * pages, it's not relevant for current memory usage. + */ + if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) + goto ext_put_continue; + + page_owner = get_page_owner(page_ext); + + /* + * Don't print "tail" pages of high-order allocations as that + * would inflate the stats. + */ + if (!IS_ALIGNED(pfn, 1 << page_owner->order)) + goto ext_put_continue; + + /* + * Access to page_ext->handle isn't synchronous so we should + * be careful to access it. 
+ */ + handle = READ_ONCE(page_owner->handle); + if (!handle) + goto ext_put_continue; + + /* Record the next PFN to read in the file offset */ + *ppos = pfn + 1; + + page_owner_tmp = *page_owner; + page_ext_put(page_ext); + return print_page_owner(buf, count, pfn, page, + &page_owner_tmp, handle); +ext_put_continue: + page_ext_put(page_ext); + } + + return 0; +} + +static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig) +{ + switch (orig) { + case SEEK_SET: + file->f_pos = offset; + break; + case SEEK_CUR: + file->f_pos += offset; + break; + default: + return -EINVAL; + } + return file->f_pos; +} + +static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) +{ + unsigned long pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); + unsigned long count = 0; + + /* + * Walk the zone in pageblock_nr_pages steps. If a page block spans + * a zone boundary, it will be double counted between zones. This does + * not matter as the mixed block count will still be correct + */ + for (; pfn < end_pfn; ) { + unsigned long block_end_pfn; + + if (!pfn_valid(pfn)) { + pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); + continue; + } + + block_end_pfn = pageblock_end_pfn(pfn); + block_end_pfn = min(block_end_pfn, end_pfn); + + for (; pfn < block_end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + struct page_ext *page_ext; + + if (page_zone(page) != zone) + continue; + + /* + * To avoid having to grab zone->lock, be a little + * careful when reading buddy page order. The only + * danger is that we skip too much and potentially miss + * some early allocated pages, which is better than + * heavy lock contention. 
+ */ + if (PageBuddy(page)) { + unsigned long order = buddy_order_unsafe(page); + + if (order > 0 && order < MAX_ORDER) + pfn += (1UL << order) - 1; + continue; + } + + if (PageReserved(page)) + continue; + + page_ext = page_ext_get(page); + if (unlikely(!page_ext)) + continue; + + /* Maybe overlapping zone */ + if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) + goto ext_put_continue; + + /* Found early allocated page */ + __set_page_owner_handle(page_ext, early_handle, + 0, 0); + count++; +ext_put_continue: + page_ext_put(page_ext); + } + cond_resched(); + } + + pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", + pgdat->node_id, zone->name, count); +} + +static void init_zones_in_node(pg_data_t *pgdat) +{ + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + + init_pages_in_zone(pgdat, zone); + } +} + +static void init_early_allocated_pages(void) +{ + pg_data_t *pgdat; + + for_each_online_pgdat(pgdat) + init_zones_in_node(pgdat); +} + +static const struct file_operations proc_page_owner_operations = { + .read = read_page_owner, + .llseek = lseek_page_owner, +}; + +static int __init pageowner_init(void) +{ + if (!static_branch_unlikely(&page_owner_inited)) { + pr_info("page_owner is disabled\n"); + return 0; + } + + debugfs_create_file("page_owner", 0400, NULL, NULL, + &proc_page_owner_operations); + + return 0; +} +late_initcall(pageowner_init) diff --git a/mm/page_poison.c b/mm/page_poison.c new file mode 100644 index 000000000..98438985e --- /dev/null +++ b/mm/page_poison.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +bool _page_poisoning_enabled_early; +EXPORT_SYMBOL(_page_poisoning_enabled_early); +DEFINE_STATIC_KEY_FALSE(_page_poisoning_enabled); +EXPORT_SYMBOL(_page_poisoning_enabled); + +static int __init 
/*
 * Return true when @a and @b differ in exactly one bit position.
 */
static bool single_bit_flip(unsigned char a, unsigned char b)
{
	unsigned char diff = a ^ b;

	/* Identical bytes: nothing flipped. */
	if (!diff)
		return false;

	/* Clearing the lowest set bit leaves zero only if one bit was set. */
	return (diff & (diff - 1)) == 0;
}
+ */ + check_poison_mem(page, kasan_reset_tag(addr), PAGE_SIZE); + kasan_enable_current(); + kunmap_atomic(addr); +} + +void __kernel_unpoison_pages(struct page *page, int n) +{ + int i; + + for (i = 0; i < n; i++) + unpoison_page(page + i); +} + +#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + /* This function does nothing, all work is done via poison pages */ +} +#endif diff --git a/mm/page_reporting.c b/mm/page_reporting.c new file mode 100644 index 000000000..382958eef --- /dev/null +++ b/mm/page_reporting.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include "page_reporting.h" +#include "internal.h" + +unsigned int page_reporting_order = MAX_ORDER; +module_param(page_reporting_order, uint, 0644); +MODULE_PARM_DESC(page_reporting_order, "Set page reporting order"); + +#define PAGE_REPORTING_DELAY (2 * HZ) +static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly; + +enum { + PAGE_REPORTING_IDLE = 0, + PAGE_REPORTING_REQUESTED, + PAGE_REPORTING_ACTIVE +}; + +/* request page reporting */ +static void +__page_reporting_request(struct page_reporting_dev_info *prdev) +{ + unsigned int state; + + /* Check to see if we are in desired state */ + state = atomic_read(&prdev->state); + if (state == PAGE_REPORTING_REQUESTED) + return; + + /* + * If reporting is already active there is nothing we need to do. + * Test against 0 as that represents PAGE_REPORTING_IDLE. + */ + state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED); + if (state != PAGE_REPORTING_IDLE) + return; + + /* + * Delay the start of work to allow a sizable queue to build. For + * now we are limiting this to running no more than once every + * couple of seconds. 
+ */ + schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY); +} + +/* notify prdev of free page reporting request */ +void __page_reporting_notify(void) +{ + struct page_reporting_dev_info *prdev; + + /* + * We use RCU to protect the pr_dev_info pointer. In almost all + * cases this should be present, however in the unlikely case of + * a shutdown this will be NULL and we should exit. + */ + rcu_read_lock(); + prdev = rcu_dereference(pr_dev_info); + if (likely(prdev)) + __page_reporting_request(prdev); + + rcu_read_unlock(); +} + +static void +page_reporting_drain(struct page_reporting_dev_info *prdev, + struct scatterlist *sgl, unsigned int nents, bool reported) +{ + struct scatterlist *sg = sgl; + + /* + * Drain the now reported pages back into their respective + * free lists/areas. We assume at least one page is populated. + */ + do { + struct page *page = sg_page(sg); + int mt = get_pageblock_migratetype(page); + unsigned int order = get_order(sg->length); + + __putback_isolated_page(page, order, mt); + + /* If the pages were not reported due to error skip flagging */ + if (!reported) + continue; + + /* + * If page was not comingled with another page we can + * consider the result to be "reported" since the page + * hasn't been modified, otherwise we will need to + * report on the new larger page when we make our way + * up to that higher order. + */ + if (PageBuddy(page) && buddy_order(page) == order) + __SetPageReported(page); + } while ((sg = sg_next(sg))); + + /* reinitialize scatterlist now that it is empty */ + sg_init_table(sgl, nents); +} + +/* + * The page reporting cycle consists of 4 stages, fill, report, drain, and + * idle. We will cycle through the first 3 stages until we cannot obtain a + * full scatterlist of pages, in that case we will switch to idle. 
+ */ +static int +page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, + unsigned int order, unsigned int mt, + struct scatterlist *sgl, unsigned int *offset) +{ + struct free_area *area = &zone->free_area[order]; + struct list_head *list = &area->free_list[mt]; + unsigned int page_len = PAGE_SIZE << order; + struct page *page, *next; + long budget; + int err = 0; + + /* + * Perform early check, if free area is empty there is + * nothing to process so we can skip this free_list. + */ + if (list_empty(list)) + return err; + + spin_lock_irq(&zone->lock); + + /* + * Limit how many calls we will be making to the page reporting + * device for this list. By doing this we avoid processing any + * given list for too long. + * + * The current value used allows us enough calls to process over a + * sixteenth of the current list plus one additional call to handle + * any pages that may have already been present from the previous + * list processed. This should result in us reporting all pages on + * an idle system in about 30 seconds. + * + * The division here should be cheap since PAGE_REPORTING_CAPACITY + * should always be a power of 2. + */ + budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16); + + /* loop through free list adding unreported pages to sg list */ + list_for_each_entry_safe(page, next, list, lru) { + /* We are going to skip over the reported pages. */ + if (PageReported(page)) + continue; + + /* + * If we fully consumed our budget then update our + * state to indicate that we are requesting additional + * processing and exit this list. 
+ */ + if (budget < 0) { + atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED); + next = page; + break; + } + + /* Attempt to pull page from list and place in scatterlist */ + if (*offset) { + if (!__isolate_free_page(page, order)) { + next = page; + break; + } + + /* Add page to scatter list */ + --(*offset); + sg_set_page(&sgl[*offset], page, page_len, 0); + + continue; + } + + /* + * Make the first non-reported page in the free list + * the new head of the free list before we release the + * zone lock. + */ + if (!list_is_first(&page->lru, list)) + list_rotate_to_front(&page->lru, list); + + /* release lock before waiting on report processing */ + spin_unlock_irq(&zone->lock); + + /* begin processing pages in local list */ + err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY); + + /* reset offset since the full list was reported */ + *offset = PAGE_REPORTING_CAPACITY; + + /* update budget to reflect call to report function */ + budget--; + + /* reacquire zone lock and resume processing */ + spin_lock_irq(&zone->lock); + + /* flush reported pages from the sg list */ + page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err); + + /* + * Reset next to first entry, the old next isn't valid + * since we dropped the lock to report the pages + */ + next = list_first_entry(list, struct page, lru); + + /* exit on error */ + if (err) + break; + } + + /* Rotate any leftover pages to the head of the freelist */ + if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list)) + list_rotate_to_front(&next->lru, list); + + spin_unlock_irq(&zone->lock); + + return err; +} + +static int +page_reporting_process_zone(struct page_reporting_dev_info *prdev, + struct scatterlist *sgl, struct zone *zone) +{ + unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY; + unsigned long watermark; + int err = 0; + + /* Generate minimum watermark to be able to guarantee progress */ + watermark = low_wmark_pages(zone) + + (PAGE_REPORTING_CAPACITY << 
page_reporting_order); + + /* + * Cancel request if insufficient free memory or if we failed + * to allocate page reporting statistics for the zone. + */ + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + return err; + + /* Process each free list starting from lowest order/mt */ + for (order = page_reporting_order; order < MAX_ORDER; order++) { + for (mt = 0; mt < MIGRATE_TYPES; mt++) { + /* We do not pull pages from the isolate free list */ + if (is_migrate_isolate(mt)) + continue; + + err = page_reporting_cycle(prdev, zone, order, mt, + sgl, &offset); + if (err) + return err; + } + } + + /* report the leftover pages before going idle */ + leftover = PAGE_REPORTING_CAPACITY - offset; + if (leftover) { + sgl = &sgl[offset]; + err = prdev->report(prdev, sgl, leftover); + + /* flush any remaining pages out from the last report */ + spin_lock_irq(&zone->lock); + page_reporting_drain(prdev, sgl, leftover, !err); + spin_unlock_irq(&zone->lock); + } + + return err; +} + +static void page_reporting_process(struct work_struct *work) +{ + struct delayed_work *d_work = to_delayed_work(work); + struct page_reporting_dev_info *prdev = + container_of(d_work, struct page_reporting_dev_info, work); + int err = 0, state = PAGE_REPORTING_ACTIVE; + struct scatterlist *sgl; + struct zone *zone; + + /* + * Change the state to "Active" so that we can track if there is + * anyone requests page reporting after we complete our pass. If + * the state is not altered by the end of the pass we will switch + * to idle and quit scheduling reporting runs. 
+ */ + atomic_set(&prdev->state, state); + + /* allocate scatterlist to store pages being reported on */ + sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL); + if (!sgl) + goto err_out; + + sg_init_table(sgl, PAGE_REPORTING_CAPACITY); + + for_each_zone(zone) { + err = page_reporting_process_zone(prdev, sgl, zone); + if (err) + break; + } + + kfree(sgl); +err_out: + /* + * If the state has reverted back to requested then there may be + * additional pages to be processed. We will defer for 2s to allow + * more pages to accumulate. + */ + state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE); + if (state == PAGE_REPORTING_REQUESTED) + schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY); +} + +static DEFINE_MUTEX(page_reporting_mutex); +DEFINE_STATIC_KEY_FALSE(page_reporting_enabled); + +int page_reporting_register(struct page_reporting_dev_info *prdev) +{ + int err = 0; + + mutex_lock(&page_reporting_mutex); + + /* nothing to do if already in use */ + if (rcu_access_pointer(pr_dev_info)) { + err = -EBUSY; + goto err_out; + } + + /* + * Update the page reporting order if it's specified by driver. + * Otherwise, it falls back to @pageblock_order. + */ + page_reporting_order = prdev->order ? 
: pageblock_order; + + /* initialize state and work structures */ + atomic_set(&prdev->state, PAGE_REPORTING_IDLE); + INIT_DELAYED_WORK(&prdev->work, &page_reporting_process); + + /* Begin initial flush of zones */ + __page_reporting_request(prdev); + + /* Assign device to allow notifications */ + rcu_assign_pointer(pr_dev_info, prdev); + + /* enable page reporting notification */ + if (!static_key_enabled(&page_reporting_enabled)) { + static_branch_enable(&page_reporting_enabled); + pr_info("Free page reporting enabled\n"); + } +err_out: + mutex_unlock(&page_reporting_mutex); + + return err; +} +EXPORT_SYMBOL_GPL(page_reporting_register); + +void page_reporting_unregister(struct page_reporting_dev_info *prdev) +{ + mutex_lock(&page_reporting_mutex); + + if (rcu_access_pointer(pr_dev_info) == prdev) { + /* Disable page reporting notification */ + RCU_INIT_POINTER(pr_dev_info, NULL); + synchronize_rcu(); + + /* Flush any existing work, and lock it out */ + cancel_delayed_work_sync(&prdev->work); + } + + mutex_unlock(&page_reporting_mutex); +} +EXPORT_SYMBOL_GPL(page_reporting_unregister); diff --git a/mm/page_reporting.h b/mm/page_reporting.h new file mode 100644 index 000000000..c51dbc228 --- /dev/null +++ b/mm/page_reporting.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_PAGE_REPORTING_H +#define _MM_PAGE_REPORTING_H + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PAGE_REPORTING +DECLARE_STATIC_KEY_FALSE(page_reporting_enabled); +extern unsigned int page_reporting_order; +void __page_reporting_notify(void); + +static inline bool page_reported(struct page *page) +{ + return static_branch_unlikely(&page_reporting_enabled) && + PageReported(page); +} + +/** + * page_reporting_notify_free - Free page notification to start page processing + * + * This function is meant to act as a screener for __page_reporting_notify + * which will determine if a give zone has crossed over the high-water mark + * that 
will justify us beginning page treatment. If we have crossed that + * threshold then it will start the process of pulling some pages and + * placing them in the batch list for treatment. + */ +static inline void page_reporting_notify_free(unsigned int order) +{ + /* Called from hot path in __free_one_page() */ + if (!static_branch_unlikely(&page_reporting_enabled)) + return; + + /* Determine if we have crossed reporting threshold */ + if (order < page_reporting_order) + return; + + /* This will add a few cycles, but should be called infrequently */ + __page_reporting_notify(); +} +#else /* CONFIG_PAGE_REPORTING */ +#define page_reported(_page) false + +static inline void page_reporting_notify_free(unsigned int order) +{ +} +#endif /* CONFIG_PAGE_REPORTING */ +#endif /*_MM_PAGE_REPORTING_H */ diff --git a/mm/page_table_check.c b/mm/page_table_check.c new file mode 100644 index 000000000..4d0506537 --- /dev/null +++ b/mm/page_table_check.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2021, Google LLC. 
+ * Pasha Tatashin + */ +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "page_table_check: " fmt + +struct page_table_check { + atomic_t anon_map_count; + atomic_t file_map_count; +}; + +static bool __page_table_check_enabled __initdata = + IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED); + +DEFINE_STATIC_KEY_TRUE(page_table_check_disabled); +EXPORT_SYMBOL(page_table_check_disabled); + +static int __init early_page_table_check_param(char *buf) +{ + return strtobool(buf, &__page_table_check_enabled); +} + +early_param("page_table_check", early_page_table_check_param); + +static bool __init need_page_table_check(void) +{ + return __page_table_check_enabled; +} + +static void __init init_page_table_check(void) +{ + if (!__page_table_check_enabled) + return; + static_branch_disable(&page_table_check_disabled); +} + +struct page_ext_operations page_table_check_ops = { + .size = sizeof(struct page_table_check), + .need = need_page_table_check, + .init = init_page_table_check, +}; + +static struct page_table_check *get_page_table_check(struct page_ext *page_ext) +{ + BUG_ON(!page_ext); + return (void *)(page_ext) + page_table_check_ops.offset; +} + +/* + * An entry is removed from the page table, decrement the counters for that page + * verify that it is of correct type and counters do not become negative. 
+ */ +static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, + unsigned long pfn, unsigned long pgcnt) +{ + struct page_ext *page_ext; + struct page *page; + unsigned long i; + bool anon; + + if (!pfn_valid(pfn)) + return; + + page = pfn_to_page(pfn); + page_ext = page_ext_get(page); + + BUG_ON(PageSlab(page)); + anon = PageAnon(page); + + for (i = 0; i < pgcnt; i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + if (anon) { + BUG_ON(atomic_read(&ptc->file_map_count)); + BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0); + } else { + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0); + } + page_ext = page_ext_next(page_ext); + } + page_ext_put(page_ext); +} + +/* + * A new entry is added to the page table, increment the counters for that page + * verify that it is of correct type and is not being mapped with a different + * type to a different process. + */ +static void page_table_check_set(struct mm_struct *mm, unsigned long addr, + unsigned long pfn, unsigned long pgcnt, + bool rw) +{ + struct page_ext *page_ext; + struct page *page; + unsigned long i; + bool anon; + + if (!pfn_valid(pfn)) + return; + + page = pfn_to_page(pfn); + page_ext = page_ext_get(page); + + BUG_ON(PageSlab(page)); + anon = PageAnon(page); + + for (i = 0; i < pgcnt; i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + if (anon) { + BUG_ON(atomic_read(&ptc->file_map_count)); + BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw); + } else { + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0); + } + page_ext = page_ext_next(page_ext); + } + page_ext_put(page_ext); +} + +/* + * page is on free list, or is being allocated, verify that counters are zeroes + * crash if they are not. 
+ */ +void __page_table_check_zero(struct page *page, unsigned int order) +{ + struct page_ext *page_ext; + unsigned long i; + + BUG_ON(PageSlab(page)); + + page_ext = page_ext_get(page); + BUG_ON(!page_ext); + for (i = 0; i < (1ul << order); i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_read(&ptc->file_map_count)); + page_ext = page_ext_next(page_ext); + } + page_ext_put(page_ext); +} + +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte) +{ + if (&init_mm == mm) + return; + + if (pte_user_accessible_page(pte)) { + page_table_check_clear(mm, addr, pte_pfn(pte), + PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pte_clear); + +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd) +{ + if (&init_mm == mm) + return; + + if (pmd_user_accessible_page(pmd)) { + page_table_check_clear(mm, addr, pmd_pfn(pmd), + PMD_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pmd_clear); + +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud) +{ + if (&init_mm == mm) + return; + + if (pud_user_accessible_page(pud)) { + page_table_check_clear(mm, addr, pud_pfn(pud), + PUD_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pud_clear); + +void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + if (&init_mm == mm) + return; + + __page_table_check_pte_clear(mm, addr, *ptep); + if (pte_user_accessible_page(pte)) { + page_table_check_set(mm, addr, pte_pfn(pte), + PAGE_SIZE >> PAGE_SHIFT, + pte_write(pte)); + } +} +EXPORT_SYMBOL(__page_table_check_pte_set); + +void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + if (&init_mm == mm) + return; + + __page_table_check_pmd_clear(mm, addr, *pmdp); + if (pmd_user_accessible_page(pmd)) { + page_table_check_set(mm, 
addr, pmd_pfn(pmd), + PMD_SIZE >> PAGE_SHIFT, + pmd_write(pmd)); + } +} +EXPORT_SYMBOL(__page_table_check_pmd_set); + +void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + if (&init_mm == mm) + return; + + __page_table_check_pud_clear(mm, addr, *pudp); + if (pud_user_accessible_page(pud)) { + page_table_check_set(mm, addr, pud_pfn(pud), + PUD_SIZE >> PAGE_SHIFT, + pud_write(pud)); + } +} +EXPORT_SYMBOL(__page_table_check_pud_set); + +void __page_table_check_pte_clear_range(struct mm_struct *mm, + unsigned long addr, + pmd_t pmd) +{ + if (&init_mm == mm) + return; + + if (!pmd_bad(pmd) && !pmd_leaf(pmd)) { + pte_t *ptep = pte_offset_map(&pmd, addr); + unsigned long i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + __page_table_check_pte_clear(mm, addr, *ptep); + addr += PAGE_SIZE; + ptep++; + } + pte_unmap(ptep - PTRS_PER_PTE); + } +} diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c new file mode 100644 index 000000000..93e13fc17 --- /dev/null +++ b/mm/page_vma_mapped.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include "internal.h" + +static inline bool not_found(struct page_vma_mapped_walk *pvmw) +{ + page_vma_mapped_walk_done(pvmw); + return false; +} + +static bool map_pte(struct page_vma_mapped_walk *pvmw) +{ + pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address); + if (!(pvmw->flags & PVMW_SYNC)) { + if (pvmw->flags & PVMW_MIGRATION) { + if (!is_swap_pte(*pvmw->pte)) + return false; + } else { + /* + * We get here when we are trying to unmap a private + * device page from the process address space. Such + * page is not CPU accessible and thus is mapped as + * a special swap entry, nonetheless it still does + * count as a valid regular mapping for the page (and + * is accounted as such in page maps count). + * + * So handle this special case as if it was a normal + * page mapping ie lock CPU page table and returns + * true. 
+ * + * For more details on device private memory see HMM + * (include/linux/hmm.h or mm/hmm.c). + */ + if (is_swap_pte(*pvmw->pte)) { + swp_entry_t entry; + + /* Handle un-addressable ZONE_DEVICE memory */ + entry = pte_to_swp_entry(*pvmw->pte); + if (!is_device_private_entry(entry) && + !is_device_exclusive_entry(entry)) + return false; + } else if (!pte_present(*pvmw->pte)) + return false; + } + } + pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd); + spin_lock(pvmw->ptl); + return true; +} + +/** + * check_pte - check if @pvmw->page is mapped at the @pvmw->pte + * @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking + * + * page_vma_mapped_walk() found a place where @pvmw->page is *potentially* + * mapped. check_pte() has to validate this. + * + * pvmw->pte may point to empty PTE, swap PTE or PTE pointing to + * arbitrary page. + * + * If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration + * entry that points to @pvmw->page or any subpage in case of THP. + * + * If PVMW_MIGRATION flag is not set, returns true if pvmw->pte points to + * pvmw->page or any subpage in case of THP. + * + * Otherwise, return false. 
+ * + */ +static bool check_pte(struct page_vma_mapped_walk *pvmw) +{ + unsigned long pfn; + + if (pvmw->flags & PVMW_MIGRATION) { + swp_entry_t entry; + if (!is_swap_pte(*pvmw->pte)) + return false; + entry = pte_to_swp_entry(*pvmw->pte); + + if (!is_migration_entry(entry) && + !is_device_exclusive_entry(entry)) + return false; + + pfn = swp_offset_pfn(entry); + } else if (is_swap_pte(*pvmw->pte)) { + swp_entry_t entry; + + /* Handle un-addressable ZONE_DEVICE memory */ + entry = pte_to_swp_entry(*pvmw->pte); + if (!is_device_private_entry(entry) && + !is_device_exclusive_entry(entry)) + return false; + + pfn = swp_offset_pfn(entry); + } else { + if (!pte_present(*pvmw->pte)) + return false; + + pfn = pte_pfn(*pvmw->pte); + } + + return (pfn - pvmw->pfn) < pvmw->nr_pages; +} + +/* Returns true if the two ranges overlap. Careful to not overflow. */ +static bool check_pmd(unsigned long pfn, struct page_vma_mapped_walk *pvmw) +{ + if ((pfn + HPAGE_PMD_NR - 1) < pvmw->pfn) + return false; + if (pfn > pvmw->pfn + pvmw->nr_pages - 1) + return false; + return true; +} + +static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) +{ + pvmw->address = (pvmw->address + size) & ~(size - 1); + if (!pvmw->address) + pvmw->address = ULONG_MAX; +} + +/** + * page_vma_mapped_walk - check if @pvmw->pfn is mapped in @pvmw->vma at + * @pvmw->address + * @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags + * must be set. pmd, pte and ptl must be NULL. + * + * Returns true if the page is mapped in the vma. @pvmw->pmd and @pvmw->pte point + * to relevant page table entries. @pvmw->ptl is locked. @pvmw->address is + * adjusted if needed (for PTE-mapped THPs). + * + * If @pvmw->pmd is set but @pvmw->pte is not, you have found PMD-mapped page + * (usually THP). For PTE-mapped THP, you should run page_vma_mapped_walk() in + * a loop to find all PTEs that map the THP. 
+ * + * For HugeTLB pages, @pvmw->pte is set to the relevant page table entry + * regardless of which page table level the page is mapped at. @pvmw->pmd is + * NULL. + * + * Returns false if there are no more page table entries for the page in + * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped. + * + * If you need to stop the walk before page_vma_mapped_walk() returned false, + * use page_vma_mapped_walk_done(). It will do the housekeeping. + */ +bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) +{ + struct vm_area_struct *vma = pvmw->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long end; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t pmde; + + /* The only possible pmd mapping has been handled on last iteration */ + if (pvmw->pmd && !pvmw->pte) + return not_found(pvmw); + + if (unlikely(is_vm_hugetlb_page(vma))) { + struct hstate *hstate = hstate_vma(vma); + unsigned long size = huge_page_size(hstate); + /* The only possible mapping was handled on last iteration */ + if (pvmw->pte) + return not_found(pvmw); + + /* when pud is not present, pte will be NULL */ + pvmw->pte = huge_pte_offset(mm, pvmw->address, size); + if (!pvmw->pte) + return false; + + pvmw->ptl = huge_pte_lock(hstate, mm, pvmw->pte); + if (!check_pte(pvmw)) + return not_found(pvmw); + return true; + } + + end = vma_address_end(pvmw); + if (pvmw->pte) + goto next_pte; +restart: + do { + pgd = pgd_offset(mm, pvmw->address); + if (!pgd_present(*pgd)) { + step_forward(pvmw, PGDIR_SIZE); + continue; + } + p4d = p4d_offset(pgd, pvmw->address); + if (!p4d_present(*p4d)) { + step_forward(pvmw, P4D_SIZE); + continue; + } + pud = pud_offset(p4d, pvmw->address); + if (!pud_present(*pud)) { + step_forward(pvmw, PUD_SIZE); + continue; + } + + pvmw->pmd = pmd_offset(pud, pvmw->address); + /* + * Make sure the pmd value isn't cached in a register by the + * compiler and used as a stale value after we've observed a + * subsequent update. 
+ */ + pmde = READ_ONCE(*pvmw->pmd); + + if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) || + (pmd_present(pmde) && pmd_devmap(pmde))) { + pvmw->ptl = pmd_lock(mm, pvmw->pmd); + pmde = *pvmw->pmd; + if (!pmd_present(pmde)) { + swp_entry_t entry; + + if (!thp_migration_supported() || + !(pvmw->flags & PVMW_MIGRATION)) + return not_found(pvmw); + entry = pmd_to_swp_entry(pmde); + if (!is_migration_entry(entry) || + !check_pmd(swp_offset_pfn(entry), pvmw)) + return not_found(pvmw); + return true; + } + if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) { + if (pvmw->flags & PVMW_MIGRATION) + return not_found(pvmw); + if (!check_pmd(pmd_pfn(pmde), pvmw)) + return not_found(pvmw); + return true; + } + /* THP pmd was split under us: handle on pte level */ + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; + } else if (!pmd_present(pmde)) { + /* + * If PVMW_SYNC, take and drop THP pmd lock so that we + * cannot return prematurely, while zap_huge_pmd() has + * cleared *pmd but not decremented compound_mapcount(). + */ + if ((pvmw->flags & PVMW_SYNC) && + transhuge_vma_suitable(vma, pvmw->address) && + (pvmw->nr_pages >= HPAGE_PMD_NR)) { + spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); + + spin_unlock(ptl); + } + step_forward(pvmw, PMD_SIZE); + continue; + } + if (!map_pte(pvmw)) + goto next_pte; +this_pte: + if (check_pte(pvmw)) + return true; +next_pte: + do { + pvmw->address += PAGE_SIZE; + if (pvmw->address >= end) + return not_found(pvmw); + /* Did we cross page table boundary? 
*/ + if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) { + if (pvmw->ptl) { + spin_unlock(pvmw->ptl); + pvmw->ptl = NULL; + } + pte_unmap(pvmw->pte); + pvmw->pte = NULL; + goto restart; + } + pvmw->pte++; + if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) { + pvmw->ptl = pte_lockptr(mm, pvmw->pmd); + spin_lock(pvmw->ptl); + } + } while (pte_none(*pvmw->pte)); + + if (!pvmw->ptl) { + pvmw->ptl = pte_lockptr(mm, pvmw->pmd); + spin_lock(pvmw->ptl); + } + goto this_pte; + } while (pvmw->address < end); + + return false; +} + +/** + * page_mapped_in_vma - check whether a page is really mapped in a VMA + * @page: the page to test + * @vma: the VMA to test + * + * Returns 1 if the page is mapped into the page tables of the VMA, 0 + * if the page is not mapped into the page tables of this VMA. Only + * valid for normal file or anonymous VMAs. + */ +int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +{ + struct page_vma_mapped_walk pvmw = { + .pfn = page_to_pfn(page), + .nr_pages = 1, + .vma = vma, + .flags = PVMW_SYNC, + }; + + pvmw.address = vma_address(page, vma); + if (pvmw.address == -EFAULT) + return 0; + if (!page_vma_mapped_walk(&pvmw)) + return 0; + page_vma_mapped_walk_done(&pvmw); + return 1; +} diff --git a/mm/pagewalk.c b/mm/pagewalk.c new file mode 100644 index 000000000..2ff3a5beb --- /dev/null +++ b/mm/pagewalk.c @@ -0,0 +1,619 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +/* + * We want to know the real level where a entry is located ignoring any + * folding of levels which may be happening. For example if p4d is folded then + * a missing entry found at level 1 (p4d) is actually at level 0 (pgd). 
+ */ +static int real_depth(int depth) +{ + if (depth == 3 && PTRS_PER_PMD == 1) + depth = 2; + if (depth == 2 && PTRS_PER_PUD == 1) + depth = 1; + if (depth == 1 && PTRS_PER_P4D == 1) + depth = 0; + return depth; +} + +static int walk_pte_range_inner(pte_t *pte, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + const struct mm_walk_ops *ops = walk->ops; + int err = 0; + + for (;;) { + err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); + if (err) + break; + if (addr >= end - PAGE_SIZE) + break; + addr += PAGE_SIZE; + pte++; + } + return err; +} + +static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pte_t *pte; + int err = 0; + spinlock_t *ptl; + + if (walk->no_vma) { + pte = pte_offset_map(pmd, addr); + err = walk_pte_range_inner(pte, addr, end, walk); + pte_unmap(pte); + } else { + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + err = walk_pte_range_inner(pte, addr, end, walk); + pte_unmap_unlock(pte, ptl); + } + + return err; +} + +#ifdef CONFIG_ARCH_HAS_HUGEPD +static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr, + unsigned long end, struct mm_walk *walk, int pdshift) +{ + int err = 0; + const struct mm_walk_ops *ops = walk->ops; + int shift = hugepd_shift(*phpd); + int page_size = 1 << shift; + + if (!ops->pte_entry) + return 0; + + if (addr & (page_size - 1)) + return 0; + + for (;;) { + pte_t *pte; + + spin_lock(&walk->mm->page_table_lock); + pte = hugepte_offset(*phpd, addr, pdshift); + err = ops->pte_entry(pte, addr, addr + page_size, walk); + spin_unlock(&walk->mm->page_table_lock); + + if (err) + break; + if (addr >= end - page_size) + break; + addr += page_size; + } + return err; +} +#else +static int walk_hugepd_range(hugepd_t *phpd, unsigned long addr, + unsigned long end, struct mm_walk *walk, int pdshift) +{ + return 0; +} +#endif + +static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pmd_t *pmd; + 
unsigned long next; + const struct mm_walk_ops *ops = walk->ops; + int err = 0; + int depth = real_depth(3); + + pmd = pmd_offset(pud, addr); + do { +again: + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd)) { + if (ops->pte_hole) + err = ops->pte_hole(addr, next, depth, walk); + if (err) + break; + continue; + } + + walk->action = ACTION_SUBTREE; + + /* + * This implies that each ->pmd_entry() handler + * needs to know about pmd_trans_huge() pmds + */ + if (ops->pmd_entry) + err = ops->pmd_entry(pmd, addr, next, walk); + if (err) + break; + + if (walk->action == ACTION_AGAIN) + goto again; + + /* + * Check this here so we only break down trans_huge + * pages when we _need_ to + */ + if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) || + walk->action == ACTION_CONTINUE || + !(ops->pte_entry)) + continue; + + if (walk->vma) { + split_huge_pmd(walk->vma, pmd, addr); + if (pmd_trans_unstable(pmd)) + goto again; + } + + if (is_hugepd(__hugepd(pmd_val(*pmd)))) + err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT); + else + err = walk_pte_range(pmd, addr, next, walk); + if (err) + break; + } while (pmd++, addr = next, addr != end); + + return err; +} + +static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pud_t *pud; + unsigned long next; + const struct mm_walk_ops *ops = walk->ops; + int err = 0; + int depth = real_depth(2); + + pud = pud_offset(p4d, addr); + do { + again: + next = pud_addr_end(addr, end); + if (pud_none(*pud)) { + if (ops->pte_hole) + err = ops->pte_hole(addr, next, depth, walk); + if (err) + break; + continue; + } + + walk->action = ACTION_SUBTREE; + + if (ops->pud_entry) + err = ops->pud_entry(pud, addr, next, walk); + if (err) + break; + + if (walk->action == ACTION_AGAIN) + goto again; + + if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) || + walk->action == ACTION_CONTINUE || + !(ops->pmd_entry || ops->pte_entry)) + continue; + + if (walk->vma) + 
split_huge_pud(walk->vma, pud, addr); + if (pud_none(*pud)) + goto again; + + if (is_hugepd(__hugepd(pud_val(*pud)))) + err = walk_hugepd_range((hugepd_t *)pud, addr, next, walk, PUD_SHIFT); + else + err = walk_pmd_range(pud, addr, next, walk); + if (err) + break; + } while (pud++, addr = next, addr != end); + + return err; +} + +static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + const struct mm_walk_ops *ops = walk->ops; + int err = 0; + int depth = real_depth(1); + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) { + if (ops->pte_hole) + err = ops->pte_hole(addr, next, depth, walk); + if (err) + break; + continue; + } + if (ops->p4d_entry) { + err = ops->p4d_entry(p4d, addr, next, walk); + if (err) + break; + } + if (is_hugepd(__hugepd(p4d_val(*p4d)))) + err = walk_hugepd_range((hugepd_t *)p4d, addr, next, walk, P4D_SHIFT); + else if (ops->pud_entry || ops->pmd_entry || ops->pte_entry) + err = walk_pud_range(p4d, addr, next, walk); + if (err) + break; + } while (p4d++, addr = next, addr != end); + + return err; +} + +static int walk_pgd_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pgd_t *pgd; + unsigned long next; + const struct mm_walk_ops *ops = walk->ops; + int err = 0; + + if (walk->pgd) + pgd = walk->pgd + pgd_index(addr); + else + pgd = pgd_offset(walk->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) { + if (ops->pte_hole) + err = ops->pte_hole(addr, next, 0, walk); + if (err) + break; + continue; + } + if (ops->pgd_entry) { + err = ops->pgd_entry(pgd, addr, next, walk); + if (err) + break; + } + if (is_hugepd(__hugepd(pgd_val(*pgd)))) + err = walk_hugepd_range((hugepd_t *)pgd, addr, next, walk, PGDIR_SHIFT); + else if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry) + err = walk_p4d_range(pgd, addr, next, walk); + if (err) 
+ break; + } while (pgd++, addr = next, addr != end); + + return err; +} + +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, + unsigned long end) +{ + unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); + return boundary < end ? boundary : end; +} + +static int walk_hugetlb_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct hstate *h = hstate_vma(vma); + unsigned long next; + unsigned long hmask = huge_page_mask(h); + unsigned long sz = huge_page_size(h); + pte_t *pte; + const struct mm_walk_ops *ops = walk->ops; + int err = 0; + + do { + next = hugetlb_entry_end(h, addr, end); + pte = huge_pte_offset(walk->mm, addr & hmask, sz); + + if (pte) + err = ops->hugetlb_entry(pte, hmask, addr, next, walk); + else if (ops->pte_hole) + err = ops->pte_hole(addr, next, -1, walk); + + if (err) + break; + } while (addr = next, addr != end); + + return err; +} + +#else /* CONFIG_HUGETLB_PAGE */ +static int walk_hugetlb_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + return 0; +} + +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * Decide whether we really walk over the current vma on [@start, @end) + * or skip it via the returned value. Return 0 if we do walk over the + * current vma, and return 1 if we skip the vma. Negative values means + * error, where we abort the current walk. + */ +static int walk_page_test(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + const struct mm_walk_ops *ops = walk->ops; + + if (ops->test_walk) + return ops->test_walk(start, end, walk); + + /* + * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP + * range, so we don't walk over it as we do for normal vmas. However, + * Some callers are interested in handling hole range and they don't + * want to just ignore any single address range. 
Such users certainly + * define their ->pte_hole() callbacks, so let's delegate them to handle + * vma(VM_PFNMAP). + */ + if (vma->vm_flags & VM_PFNMAP) { + int err = 1; + if (ops->pte_hole) + err = ops->pte_hole(start, end, -1, walk); + return err ? err : 1; + } + return 0; +} + +static int __walk_page_range(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + int err = 0; + struct vm_area_struct *vma = walk->vma; + const struct mm_walk_ops *ops = walk->ops; + + if (ops->pre_vma) { + err = ops->pre_vma(start, end, walk); + if (err) + return err; + } + + if (is_vm_hugetlb_page(vma)) { + if (ops->hugetlb_entry) + err = walk_hugetlb_range(start, end, walk); + } else + err = walk_pgd_range(start, end, walk); + + if (ops->post_vma) + ops->post_vma(walk); + + return err; +} + +/** + * walk_page_range - walk page table with caller specific callbacks + * @mm: mm_struct representing the target process of page table walk + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @ops: operation to call during the walk + * @private: private data for callbacks' usage + * + * Recursively walk the page table tree of the process represented by @mm + * within the virtual address range [@start, @end). During walking, we can do + * some caller-specific works for each entry, by setting up pmd_entry(), + * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these + * callbacks, the associated entries/pages are just ignored. + * The return values of these callbacks are commonly defined like below: + * + * - 0 : succeeded to handle the current entry, and if you don't reach the + * end address yet, continue to walk. + * - >0 : succeeded to handle the current entry, and return to the caller + * with caller specific value. + * - <0 : failed to handle the current entry, and return to the caller + * with error code. 
+ * + * Before starting to walk page table, some callers want to check whether + * they really want to walk over the current vma, typically by checking + * its vm_flags. walk_page_test() and @ops->test_walk() are used for this + * purpose. + * + * If operations need to be staged before and committed after a vma is walked, + * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(), + * since it is intended to handle commit-type operations, can't return any + * errors. + * + * struct mm_walk keeps current values of some common data like vma and pmd, + * which are useful for the access from callbacks. If you want to pass some + * caller-specific data to callbacks, @private should be helpful. + * + * Locking: + * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock, + * because these function traverse vma list and/or access to vma's data. + */ +int walk_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private) +{ + int err = 0; + unsigned long next; + struct vm_area_struct *vma; + struct mm_walk walk = { + .ops = ops, + .mm = mm, + .private = private, + }; + + if (start >= end) + return -EINVAL; + + if (!walk.mm) + return -EINVAL; + + mmap_assert_locked(walk.mm); + + vma = find_vma(walk.mm, start); + do { + if (!vma) { /* after the last vma */ + walk.vma = NULL; + next = end; + if (ops->pte_hole) + err = ops->pte_hole(start, next, -1, &walk); + } else if (start < vma->vm_start) { /* outside vma */ + walk.vma = NULL; + next = min(end, vma->vm_start); + if (ops->pte_hole) + err = ops->pte_hole(start, next, -1, &walk); + } else { /* inside vma */ + walk.vma = vma; + next = min(end, vma->vm_end); + vma = find_vma(mm, vma->vm_end); + + err = walk_page_test(start, next, &walk); + if (err > 0) { + /* + * positive return values are purely for + * controlling the pagewalk, so should never + * be passed to the callers. 
+ */ + err = 0; + continue; + } + if (err < 0) + break; + err = __walk_page_range(start, next, &walk); + } + if (err) + break; + } while (start = next, start < end); + return err; +} + +/** + * walk_page_range_novma - walk a range of pagetables not backed by a vma + * @mm: mm_struct representing the target process of page table walk + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @ops: operation to call during the walk + * @pgd: pgd to walk if different from mm->pgd + * @private: private data for callbacks' usage + * + * Similar to walk_page_range() but can walk any page tables even if they are + * not backed by VMAs. Because 'unusual' entries may be walked this function + * will also not lock the PTEs for the pte_entry() callback. This is useful for + * walking the kernel pages tables or page tables for firmware. + */ +int walk_page_range_novma(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, + void *private) +{ + struct mm_walk walk = { + .ops = ops, + .mm = mm, + .pgd = pgd, + .private = private, + .no_vma = true + }; + + if (start >= end || !walk.mm) + return -EINVAL; + + mmap_assert_write_locked(walk.mm); + + return walk_pgd_range(start, end, &walk); +} + +int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, + void *private) +{ + struct mm_walk walk = { + .ops = ops, + .mm = vma->vm_mm, + .vma = vma, + .private = private, + }; + int err; + + if (!walk.mm) + return -EINVAL; + + mmap_assert_locked(walk.mm); + + err = walk_page_test(vma->vm_start, vma->vm_end, &walk); + if (err > 0) + return 0; + if (err < 0) + return err; + return __walk_page_range(vma->vm_start, vma->vm_end, &walk); +} + +/** + * walk_page_mapping - walk all memory areas mapped into a struct address_space. 
+ * @mapping: Pointer to the struct address_space + * @first_index: First page offset in the address_space + * @nr: Number of incremental page offsets to cover + * @ops: operation to call during the walk + * @private: private data for callbacks' usage + * + * This function walks all memory areas mapped into a struct address_space. + * The walk is limited to only the given page-size index range, but if + * the index boundaries cross a huge page-table entry, that entry will be + * included. + * + * Also see walk_page_range() for additional information. + * + * Locking: + * This function can't require that the struct mm_struct::mmap_lock is held, + * since @mapping may be mapped by multiple processes. Instead + * @mapping->i_mmap_rwsem must be held. This might have implications in the + * callbacks, and it's up to the caller to ensure that the + * struct mm_struct::mmap_lock is not needed. + * + * Also this means that a caller can't rely on the struct + * vm_area_struct::vm_flags to be constant across a call, + * except for immutable flags. Callers requiring this shouldn't use + * this function. + * + * Return: 0 on success, negative error code on failure, positive number on + * caller defined premature termination. 
+ */ +int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, + pgoff_t nr, const struct mm_walk_ops *ops, + void *private) +{ + struct mm_walk walk = { + .ops = ops, + .private = private, + }; + struct vm_area_struct *vma; + pgoff_t vba, vea, cba, cea; + unsigned long start_addr, end_addr; + int err = 0; + + lockdep_assert_held(&mapping->i_mmap_rwsem); + vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, + first_index + nr - 1) { + /* Clip to the vma */ + vba = vma->vm_pgoff; + vea = vba + vma_pages(vma); + cba = first_index; + cba = max(cba, vba); + cea = first_index + nr; + cea = min(cea, vea); + + start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start; + end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start; + if (start_addr >= end_addr) + continue; + + walk.vma = vma; + walk.mm = vma->vm_mm; + + err = walk_page_test(vma->vm_start, vma->vm_end, &walk); + if (err > 0) { + err = 0; + break; + } else if (err < 0) + break; + + err = __walk_page_range(start_addr, end_addr, &walk); + if (err) + break; + } + + return err; +} diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h new file mode 100644 index 000000000..70b1ea23f --- /dev/null +++ b/mm/percpu-internal.h @@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_PERCPU_INTERNAL_H +#define _MM_PERCPU_INTERNAL_H + +#include +#include + +/* + * pcpu_block_md is the metadata block struct. + * Each chunk's bitmap is split into a number of full blocks. + * All units are in terms of bits. + * + * The scan hint is the largest known contiguous area before the contig hint. + * It is not necessarily the actual largest contig hint though. There is an + * invariant that the scan_hint_start > contig_hint_start iff + * scan_hint == contig_hint. This is necessary because when scanning forward, + * we don't know if a new contig hint would be better than the current one. 
+ */ +struct pcpu_block_md { + int scan_hint; /* scan hint for block */ + int scan_hint_start; /* block relative starting + position of the scan hint */ + int contig_hint; /* contig hint for block */ + int contig_hint_start; /* block relative starting + position of the contig hint */ + int left_free; /* size of free space along + the left side of the block */ + int right_free; /* size of free space along + the right side of the block */ + int first_free; /* block position of first free */ + int nr_bits; /* total bits responsible for */ +}; + +struct pcpu_chunk { +#ifdef CONFIG_PERCPU_STATS + int nr_alloc; /* # of allocations */ + size_t max_alloc_size; /* largest allocation size */ +#endif + + struct list_head list; /* linked to pcpu_slot lists */ + int free_bytes; /* free bytes in the chunk */ + struct pcpu_block_md chunk_md; + void *base_addr; /* base address of this chunk */ + + unsigned long *alloc_map; /* allocation map */ + unsigned long *bound_map; /* boundary map */ + struct pcpu_block_md *md_blocks; /* metadata blocks */ + + void *data; /* chunk data */ + bool immutable; /* no [de]population allowed */ + bool isolated; /* isolated from active chunk + slots */ + int start_offset; /* the overlap with the previous + region to have a page aligned + base_addr */ + int end_offset; /* additional area required to + have the region end page + aligned */ +#ifdef CONFIG_MEMCG_KMEM + struct obj_cgroup **obj_cgroups; /* vector of object cgroups */ +#endif + + int nr_pages; /* # of pages served by this chunk */ + int nr_populated; /* # of populated pages */ + int nr_empty_pop_pages; /* # of empty populated pages */ + unsigned long populated[]; /* populated bitmap */ +}; + +extern spinlock_t pcpu_lock; + +extern struct list_head *pcpu_chunk_lists; +extern int pcpu_nr_slots; +extern int pcpu_sidelined_slot; +extern int pcpu_to_depopulate_slot; +extern int pcpu_nr_empty_pop_pages; + +extern struct pcpu_chunk *pcpu_first_chunk; +extern struct pcpu_chunk 
*pcpu_reserved_chunk; + +/** + * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks + * @chunk: chunk of interest + * + * This conversion is from the number of physical pages that the chunk + * serves to the number of bitmap blocks used. + */ +static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk) +{ + return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE; +} + +/** + * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap + * @pages: number of physical pages + * + * This conversion is from physical pages to the number of bits + * required in the bitmap. + */ +static inline int pcpu_nr_pages_to_map_bits(int pages) +{ + return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; +} + +/** + * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap + * @chunk: chunk of interest + * + * This conversion is from the number of physical pages that the chunk + * serves to the number of bits in the bitmap. + */ +static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) +{ + return pcpu_nr_pages_to_map_bits(chunk->nr_pages); +} + +/** + * pcpu_obj_full_size - helper to calculate size of each accounted object + * @size: size of area to allocate in bytes + * + * For each accounted object there is an extra space which is used to store + * obj_cgroup membership. Charge it too. 
+ */ +static inline size_t pcpu_obj_full_size(size_t size) +{ + size_t extra_size = 0; + +#ifdef CONFIG_MEMCG_KMEM + extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); +#endif + + return size * num_possible_cpus() + extra_size; +} + +#ifdef CONFIG_PERCPU_STATS + +#include + +struct percpu_stats { + u64 nr_alloc; /* lifetime # of allocations */ + u64 nr_dealloc; /* lifetime # of deallocations */ + u64 nr_cur_alloc; /* current # of allocations */ + u64 nr_max_alloc; /* max # of live allocations */ + u32 nr_chunks; /* current # of live chunks */ + u32 nr_max_chunks; /* max # of live chunks */ + size_t min_alloc_size; /* min allocation size */ + size_t max_alloc_size; /* max allocation size */ +}; + +extern struct percpu_stats pcpu_stats; +extern struct pcpu_alloc_info pcpu_stats_ai; + +/* + * For debug purposes. We don't care about the flexible array. + */ +static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) +{ + memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info)); + + /* initialize min_alloc_size to unit_size */ + pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size; +} + +/* + * pcpu_stats_area_alloc - increment area allocation stats + * @chunk: the location of the area being allocated + * @size: size of area to allocate in bytes + * + * CONTEXT: + * pcpu_lock. + */ +static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) +{ + lockdep_assert_held(&pcpu_lock); + + pcpu_stats.nr_alloc++; + pcpu_stats.nr_cur_alloc++; + pcpu_stats.nr_max_alloc = + max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc); + pcpu_stats.min_alloc_size = + min(pcpu_stats.min_alloc_size, size); + pcpu_stats.max_alloc_size = + max(pcpu_stats.max_alloc_size, size); + + chunk->nr_alloc++; + chunk->max_alloc_size = max(chunk->max_alloc_size, size); +} + +/* + * pcpu_stats_area_dealloc - decrement allocation stats + * @chunk: the location of the area being deallocated + * + * CONTEXT: + * pcpu_lock. 
+ */ +static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) +{ + lockdep_assert_held(&pcpu_lock); + + pcpu_stats.nr_dealloc++; + pcpu_stats.nr_cur_alloc--; + + chunk->nr_alloc--; +} + +/* + * pcpu_stats_chunk_alloc - increment chunk stats + */ +static inline void pcpu_stats_chunk_alloc(void) +{ + unsigned long flags; + spin_lock_irqsave(&pcpu_lock, flags); + + pcpu_stats.nr_chunks++; + pcpu_stats.nr_max_chunks = + max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks); + + spin_unlock_irqrestore(&pcpu_lock, flags); +} + +/* + * pcpu_stats_chunk_dealloc - decrement chunk stats + */ +static inline void pcpu_stats_chunk_dealloc(void) +{ + unsigned long flags; + spin_lock_irqsave(&pcpu_lock, flags); + + pcpu_stats.nr_chunks--; + + spin_unlock_irqrestore(&pcpu_lock, flags); +} + +#else + +static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) +{ +} + +static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) +{ +} + +static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) +{ +} + +static inline void pcpu_stats_chunk_alloc(void) +{ +} + +static inline void pcpu_stats_chunk_dealloc(void) +{ +} + +#endif /* !CONFIG_PERCPU_STATS */ + +#endif diff --git a/mm/percpu-km.c b/mm/percpu-km.c new file mode 100644 index 000000000..fe31aa19d --- /dev/null +++ b/mm/percpu-km.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm/percpu-km.c - kernel memory based chunk allocation + * + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo + * + * Chunks are allocated as a contiguous kernel memory using gfp + * allocation. This is to be used on nommu architectures. + * + * To use percpu-km, + * + * - define CONFIG_NEED_PER_CPU_KM from the arch Kconfig. + * + * - CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK must not be defined. It's + * not compatible with PER_CPU_KM. EMBED_FIRST_CHUNK should work + * fine. + * + * - NUMA is not supported. 
When setting up the first chunk, + * @cpu_distance_fn should be NULL or report all CPUs to be nearer + * than or at LOCAL_DISTANCE. + * + * - It's best if the chunk size is power of two multiple of + * PAGE_SIZE. Because each chunk is allocated as a contiguous + * kernel memory block using alloc_pages(), memory will be wasted if + * chunk size is not aligned. percpu-km code will whine about it. + */ + +#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) +#error "contiguous percpu allocation is incompatible with paged first chunk" +#endif + +#include + +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + /* nothing */ +} + +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end, gfp_t gfp) +{ + return 0; +} + +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + /* nada */ +} + +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) +{ + const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; + struct pcpu_chunk *chunk; + struct page *pages; + unsigned long flags; + int i; + + chunk = pcpu_alloc_chunk(gfp); + if (!chunk) + return NULL; + + pages = alloc_pages(gfp, order_base_2(nr_pages)); + if (!pages) { + pcpu_free_chunk(chunk); + return NULL; + } + + for (i = 0; i < nr_pages; i++) + pcpu_set_page_chunk(nth_page(pages, i), chunk); + + chunk->data = pages; + chunk->base_addr = page_address(pages); + + spin_lock_irqsave(&pcpu_lock, flags); + pcpu_chunk_populated(chunk, 0, nr_pages); + spin_unlock_irqrestore(&pcpu_lock, flags); + + pcpu_stats_chunk_alloc(); + trace_percpu_create_chunk(chunk->base_addr); + + return chunk; +} + +static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) +{ + const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; + + if (!chunk) + return; + + pcpu_stats_chunk_dealloc(); + trace_percpu_destroy_chunk(chunk->base_addr); + + if (chunk->data) + __free_pages(chunk->data, order_base_2(nr_pages)); + 
pcpu_free_chunk(chunk); +} + +static struct page *pcpu_addr_to_page(void *addr) +{ + return virt_to_page(addr); +} + +static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) +{ + size_t nr_pages, alloc_pages; + + /* all units must be in a single group */ + if (ai->nr_groups != 1) { + pr_crit("can't handle more than one group\n"); + return -EINVAL; + } + + nr_pages = (ai->groups[0].nr_units * ai->unit_size) >> PAGE_SHIFT; + alloc_pages = roundup_pow_of_two(nr_pages); + + if (alloc_pages > nr_pages) + pr_warn("wasting %zu pages per chunk\n", + alloc_pages - nr_pages); + + return 0; +} + +static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk) +{ + return false; +} diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c new file mode 100644 index 000000000..dd3590dfc --- /dev/null +++ b/mm/percpu-stats.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm/percpu-debug.c + * + * Copyright (C) 2017 Facebook Inc. + * Copyright (C) 2017 Dennis Zhou + * + * Prints statistics about the percpu allocator and backing chunks. + */ +#include +#include +#include +#include +#include +#include + +#include "percpu-internal.h" + +#define P(X, Y) \ + seq_printf(m, " %-20s: %12lld\n", X, (long long int)Y) + +struct percpu_stats pcpu_stats; +struct pcpu_alloc_info pcpu_stats_ai; + +static int cmpint(const void *a, const void *b) +{ + return *(int *)a - *(int *)b; +} + +/* + * Iterates over all chunks to find the max nr_alloc entries. + */ +static int find_max_nr_alloc(void) +{ + struct pcpu_chunk *chunk; + int slot, max_nr_alloc; + + max_nr_alloc = 0; + for (slot = 0; slot < pcpu_nr_slots; slot++) + list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) + max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc); + + return max_nr_alloc; +} + +/* + * Prints out chunk state. Fragmentation is considered between + * the beginning of the chunk to the last allocation. + * + * All statistics are in bytes unless stated otherwise. 
+ */ +static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, + int *buffer) +{ + struct pcpu_block_md *chunk_md = &chunk->chunk_md; + int i, last_alloc, as_len, start, end; + int *alloc_sizes, *p; + /* statistics */ + int sum_frag = 0, max_frag = 0; + int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0; + + alloc_sizes = buffer; + + /* + * find_last_bit returns its size argument if nothing is found. + * Therefore, we must determine if it is a failure of find_last_bit + * and set the appropriate value. + */ + last_alloc = find_last_bit(chunk->alloc_map, + pcpu_chunk_map_bits(chunk) - + chunk->end_offset / PCPU_MIN_ALLOC_SIZE - 1); + last_alloc = test_bit(last_alloc, chunk->alloc_map) ? + last_alloc + 1 : 0; + + as_len = 0; + start = chunk->start_offset / PCPU_MIN_ALLOC_SIZE; + + /* + * If a bit is set in the allocation map, the bound_map identifies + * where the allocation ends. If the allocation is not set, the + * bound_map does not identify free areas as it is only kept accurate + * on allocation, not free. + * + * Positive values are allocations and negative values are free + * fragments. + */ + while (start < last_alloc) { + if (test_bit(start, chunk->alloc_map)) { + end = find_next_bit(chunk->bound_map, last_alloc, + start + 1); + alloc_sizes[as_len] = 1; + } else { + end = find_next_bit(chunk->alloc_map, last_alloc, + start + 1); + alloc_sizes[as_len] = -1; + } + + alloc_sizes[as_len++] *= (end - start) * PCPU_MIN_ALLOC_SIZE; + + start = end; + } + + /* + * The negative values are free fragments and thus sorting gives the + * free fragments at the beginning in largest first order.
+ */ + if (as_len > 0) { + sort(alloc_sizes, as_len, sizeof(int), cmpint, NULL); + + /* + * Iterate through the unallocated fragments. The bound is + * checked before *p is read so the scan never reads past the + * as_len entries filled in above. + */ + for (i = 0, p = alloc_sizes; i < as_len && *p < 0; i++, p++) { + sum_frag -= *p; + max_frag = max(max_frag, -1 * (*p)); + } + + /* allocations, if any, occupy alloc_sizes[i..as_len - 1] */ + if (i < as_len) { + cur_min_alloc = alloc_sizes[i]; + cur_med_alloc = alloc_sizes[(i + as_len - 1) / 2]; + cur_max_alloc = alloc_sizes[as_len - 1]; + } + } + + P("nr_alloc", chunk->nr_alloc); + P("max_alloc_size", chunk->max_alloc_size); + P("empty_pop_pages", chunk->nr_empty_pop_pages); + P("first_bit", chunk_md->first_free); + P("free_bytes", chunk->free_bytes); + P("contig_bytes", chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); + P("sum_frag", sum_frag); + P("max_frag", max_frag); + P("cur_min_alloc", cur_min_alloc); + P("cur_med_alloc", cur_med_alloc); + P("cur_max_alloc", cur_max_alloc); + seq_putc(m, '\n'); +} + +static int percpu_stats_show(struct seq_file *m, void *v) +{ + struct pcpu_chunk *chunk; + int slot, max_nr_alloc; + int *buffer; + +alloc_buffer: + spin_lock_irq(&pcpu_lock); + max_nr_alloc = find_max_nr_alloc(); + spin_unlock_irq(&pcpu_lock); + + /* there can be at most this many free and allocated fragments */ + buffer = vmalloc_array(2 * max_nr_alloc + 1, sizeof(int)); + if (!buffer) + return -ENOMEM; + + spin_lock_irq(&pcpu_lock); + + /* if the buffer allocated earlier is too small */ + if (max_nr_alloc < find_max_nr_alloc()) { + spin_unlock_irq(&pcpu_lock); + vfree(buffer); + goto alloc_buffer; + } + +#define PL(X) \ + seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X) + + seq_printf(m, + "Percpu Memory Statistics\n" + "Allocation Info:\n" + "----------------------------------------\n"); + PL(unit_size); + PL(static_size); + PL(reserved_size); + PL(dyn_size); + PL(atom_size); + PL(alloc_size); + seq_putc(m, '\n'); + +#undef PL + +#define PU(X) \ + seq_printf(m, " %-20s: %12llu\n", #X, (unsigned long long)pcpu_stats.X) + + seq_printf(m, + "Global Stats:\n" +
"----------------------------------------\n"); + PU(nr_alloc); + PU(nr_dealloc); + PU(nr_cur_alloc); + PU(nr_max_alloc); + PU(nr_chunks); + PU(nr_max_chunks); + PU(min_alloc_size); + PU(max_alloc_size); + P("empty_pop_pages", pcpu_nr_empty_pop_pages); + seq_putc(m, '\n'); + +#undef PU + + seq_printf(m, + "Per Chunk Stats:\n" + "----------------------------------------\n"); + + if (pcpu_reserved_chunk) { + seq_puts(m, "Chunk: <- Reserved Chunk\n"); + chunk_map_stats(m, pcpu_reserved_chunk, buffer); + } + + for (slot = 0; slot < pcpu_nr_slots; slot++) { + list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) { + if (chunk == pcpu_first_chunk) + seq_puts(m, "Chunk: <- First Chunk\n"); + else if (slot == pcpu_to_depopulate_slot) + seq_puts(m, "Chunk (to_depopulate)\n"); + else if (slot == pcpu_sidelined_slot) + seq_puts(m, "Chunk (sidelined):\n"); + else + seq_puts(m, "Chunk:\n"); + chunk_map_stats(m, chunk, buffer); + } + } + + spin_unlock_irq(&pcpu_lock); + + vfree(buffer); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(percpu_stats); + +static int __init init_percpu_stats_debugfs(void) +{ + debugfs_create_file("percpu_stats", 0444, NULL, NULL, + &percpu_stats_fops); + + return 0; +} + +late_initcall(init_percpu_stats_debugfs); diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c new file mode 100644 index 000000000..2054c9213 --- /dev/null +++ b/mm/percpu-vm.c @@ -0,0 +1,410 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm/percpu-vm.c - vmalloc area based chunk allocation + * + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo + * + * Chunks are mapped into vmalloc areas and populated page by page. + * This is the default chunk allocator. 
+ */ +#include "internal.h" + +/* + * Look up the page backing page @page_idx of @cpu's unit in @chunk by + * translating the unit address with vmalloc_to_page(). + */ +static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + /* must not be used on pre-mapped chunk */ + WARN_ON(chunk->immutable); + + return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); +} + +/** + * pcpu_get_pages - get temp pages array + * + * Returns pointer to array of pointers to struct page which can be indexed + * with pcpu_page_idx(). Note that there is only one array and accesses + * should be serialized by pcpu_alloc_mutex. + * + * RETURNS: + * Pointer to temp pages array on success, NULL if it could not be allocated. + */ +static struct page **pcpu_get_pages(void) +{ + static struct page **pages; + size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); + + lockdep_assert_held(&pcpu_alloc_mutex); + + /* allocated on first use and kept in the static pointer afterwards */ + if (!pages) + pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL); + return pages; +} + +/** + * pcpu_free_pages - free pages which were allocated for @chunk + * @chunk: chunk pages were allocated for + * @pages: array of pages to be freed, indexed by pcpu_page_idx() + * @page_start: page index of the first page to be freed + * @page_end: page index of the last page to be freed + 1 + * + * Free pages [@page_start,@page_end) in @pages for all units. + * The pages were allocated for @chunk.
+ */ +static void pcpu_free_pages(struct pcpu_chunk *chunk, + struct page **pages, int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page = pages[pcpu_page_idx(cpu, i)]; + + if (page) + __free_page(page); + } + } +} + +/** + * pcpu_alloc_pages - allocates pages for @chunk + * @chunk: target chunk + * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() + * @page_start: page index of the first page to be allocated + * @page_end: page index of the last page to be allocated + 1 + * @gfp: allocation flags passed to the underlying allocator + * + * Allocate pages [@page_start,@page_end) into @pages for all units. + * The allocation is for @chunk. Percpu core doesn't care about the + * content of @pages and will pass it verbatim to pcpu_map_pages(). + */ +static int pcpu_alloc_pages(struct pcpu_chunk *chunk, + struct page **pages, int page_start, int page_end, + gfp_t gfp) +{ + unsigned int cpu, tcpu; + int i; + + gfp |= __GFP_HIGHMEM; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; + + *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); + if (!*pagep) + goto err; + } + } + return 0; + +err: + while (--i >= page_start) + __free_page(pages[pcpu_page_idx(cpu, i)]); + + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + for (i = page_start; i < page_end; i++) + __free_page(pages[pcpu_page_idx(tcpu, i)]); + } + return -ENOMEM; +} + +/** + * pcpu_pre_unmap_flush - flush cache prior to unmapping + * @chunk: chunk the regions to be flushed belongs to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages in [@page_start,@page_end) of @chunk are about to be + * unmapped. Flush cache. 
As each flushing trial can be very + * expensive, issue flush on the whole region at once rather than + * doing it for each cpu. This could be an overkill but is more + * scalable. + */ +static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_cache_vunmap( + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); +} + +static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) +{ + vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT)); +} + +/** + * pcpu_unmap_pages - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @pages: pages array which can be used to pass information to free + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * Corresponding elements in @pages were cleared by the caller and can + * be used to carry information to pcpu_free_pages() which will be + * called after all unmaps are finished. The caller should call + * proper pre/post flush functions. + */ +static void pcpu_unmap_pages(struct pcpu_chunk *chunk, + struct page **pages, int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page; + + page = pcpu_chunk_page(chunk, cpu, i); + WARN_ON(!page); + pages[pcpu_page_idx(cpu, i)] = page; + } + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), + page_end - page_start); + } +} + +/** + * pcpu_post_unmap_tlb_flush - flush TLB after unmapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush + * TLB for the regions. 
This can be skipped if the area is to be + * returned to vmalloc as vmalloc will handle TLB flushing lazily. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_tlb_kernel_range( + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); +} + +static int __pcpu_map_pages(unsigned long addr, struct page **pages, + int nr_pages) +{ + return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT), + PAGE_KERNEL, pages, PAGE_SHIFT); +} + +/** + * pcpu_map_pages - map pages into a pcpu_chunk + * @chunk: chunk of interest + * @pages: pages array containing pages to be mapped + * @page_start: page index of the first page to map + * @page_end: page index of the last page to map + 1 + * + * For each cpu, map pages [@page_start,@page_end) into @chunk. The + * caller is responsible for calling pcpu_post_map_flush() after all + * mappings are complete. + * + * This function is responsible for setting up whatever is necessary for + * reverse lookup (addr -> chunk). 
+ */ +static int pcpu_map_pages(struct pcpu_chunk *chunk, + struct page **pages, int page_start, int page_end) +{ + unsigned int cpu, tcpu; + int i, err; + + for_each_possible_cpu(cpu) { + err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), + &pages[pcpu_page_idx(cpu, page_start)], + page_end - page_start); + if (err < 0) + goto err; + + /* record the owning chunk in each mapped page for reverse lookup */ + for (i = page_start; i < page_end; i++) + pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], + chunk); + } + return 0; +err: + /* roll back: unmap only the cpus that were mapped before the failing one */ + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), + page_end - page_start); + } + pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); + return err; +} + +/** + * pcpu_post_map_flush - flush cache after mapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been mapped. Flush + * cache. + * + * As with pcpu_pre_unmap_flush(), cache flushing is done at once + * for the whole region. + */ +static void pcpu_post_map_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_cache_vmap( + pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); +} + +/** + * pcpu_populate_chunk - populate and map an area of a pcpu_chunk + * @chunk: chunk of interest + * @page_start: the start page + * @page_end: the end page + * @gfp: allocation flags passed to the underlying memory allocator + * + * For each cpu, populate and map pages [@page_start,@page_end) into + * @chunk. + * + * CONTEXT: + * pcpu_alloc_mutex, does GFP_KERNEL allocation.
+ */ +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end, gfp_t gfp) +{ + struct page **pages; + + pages = pcpu_get_pages(); + if (!pages) + return -ENOMEM; + + if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp)) + return -ENOMEM; + + if (pcpu_map_pages(chunk, pages, page_start, page_end)) { + pcpu_free_pages(chunk, pages, page_start, page_end); + return -ENOMEM; + } + pcpu_post_map_flush(chunk, page_start, page_end); + + return 0; +} + +/** + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk + * @chunk: chunk to depopulate + * @page_start: the start page + * @page_end: the end page + * + * For each cpu, depopulate and unmap pages [@page_start,@page_end) + * from @chunk. + * + * Caller is required to call pcpu_post_unmap_tlb_flush() if not returning the + * region back to vmalloc() which will lazily flush the tlb. + * + * CONTEXT: + * pcpu_alloc_mutex. + */ +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + struct page **pages; + + /* + * If control reaches here, there must have been at least one + * successful population attempt so the temp pages array must + * be available now. 
+ */ + pages = pcpu_get_pages(); + BUG_ON(!pages); + + /* unmap and free */ + pcpu_pre_unmap_flush(chunk, page_start, page_end); + + pcpu_unmap_pages(chunk, pages, page_start, page_end); + + pcpu_free_pages(chunk, pages, page_start, page_end); +} + +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) +{ + struct pcpu_chunk *chunk; + struct vm_struct **vms; + + chunk = pcpu_alloc_chunk(gfp); + if (!chunk) + return NULL; + + vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, + pcpu_nr_groups, pcpu_atom_size); + if (!vms) { + pcpu_free_chunk(chunk); + return NULL; + } + + chunk->data = vms; + chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; + + pcpu_stats_chunk_alloc(); + trace_percpu_create_chunk(chunk->base_addr); + + return chunk; +} + +static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) +{ + if (!chunk) + return; + + pcpu_stats_chunk_dealloc(); + trace_percpu_destroy_chunk(chunk->base_addr); + + if (chunk->data) + pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); + pcpu_free_chunk(chunk); +} + +static struct page *pcpu_addr_to_page(void *addr) +{ + return vmalloc_to_page(addr); +} + +static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) +{ + /* no extra restriction */ + return 0; +} + +/** + * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim + * @chunk: chunk of interest + * + * This is the entry point for percpu reclaim. If a chunk qualifies, it is then + * isolated and managed in separate lists at the back of pcpu_slot: sidelined + * and to_depopulate respectively. The to_depopulate list holds chunks slated + * for depopulation. They no longer contribute to pcpu_nr_empty_pop_pages once + * they are on this list. Once depopulated, they are moved onto the sidelined + * list which enables them to be pulled back in for allocation if no other chunk + * can suffice the allocation. 
+ */ +static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk) +{ + /* do not reclaim either the first chunk or reserved chunk */ + if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk) + return false; + + /* + * If it is isolated and still has empty populated pages, it may be on + * the sidelined list so move it back to the to_depopulate list. + * Otherwise, if at least 1/4 of its pages are empty populated pages AND + * there is no system-wide shortage of empty pages aside from this + * chunk, move it to the to_depopulate list. + */ + return ((chunk->isolated && chunk->nr_empty_pop_pages) || + (pcpu_nr_empty_pop_pages > + (PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) && + chunk->nr_empty_pop_pages >= chunk->nr_pages / 4)); +} diff --git a/mm/percpu.c b/mm/percpu.c new file mode 100644 index 000000000..27697b242 --- /dev/null +++ b/mm/percpu.c @@ -0,0 +1,3461 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm/percpu.c - percpu memory allocator + * + * Copyright (C) 2009 SUSE Linux Products GmbH + * Copyright (C) 2009 Tejun Heo + * + * Copyright (C) 2017 Facebook Inc. + * Copyright (C) 2017 Dennis Zhou + * + * The percpu allocator handles both static and dynamic areas. Percpu + * areas are allocated in chunks which are divided into units. There is + * a 1-to-1 mapping for units to possible cpus. These units are grouped + * based on NUMA properties of the machine. + * + * c0 c1 c2 + * ------------------- ------------------- ------------ + * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u + * ------------------- ...... ------------------- .... ------------ + * + * Allocation is done by offsets into a unit's address space. I.e., an + * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0, + * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear + * and even sparse. Access is handled by configuring percpu base + * registers according to the cpu to unit mappings and offsetting the + * base address using pcpu_unit_size.
+ * + * There is special consideration for the first chunk which must handle + * the static percpu variables in the kernel image as allocation services + * are not online yet. In short, the first chunk is structured like so: + * + * <Static | [Reserved] | Dynamic> + * + * The static data is copied from the original section managed by the + * linker. The reserved section, if non-zero, primarily manages static + * percpu variables from kernel modules. Finally, the dynamic section + * takes care of normal allocations. + * + * The allocator organizes chunks into lists according to free size and + * memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT + * flag should be passed. All memcg-aware allocations are sharing one set + * of chunks and all unaccounted allocations and allocations performed + * by processes belonging to the root memory cgroup are using the second set. + * + * The allocator tries to allocate from the fullest chunk first. Each chunk + * is managed by a bitmap with metadata blocks. The allocation map is updated + * on every allocation and free to reflect the current state while the boundary + * map is only updated on allocation. Each metadata block contains + * information to help mitigate the need to iterate over large portions + * of the bitmap. The reverse mapping from page to chunk is stored in + * the page's index. Lastly, units are lazily backed and grow in unison. + * + * There is a unique conversion that goes on here between bytes and bits. + * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk + * tracks the number of pages it is responsible for in nr_pages. Helper + * functions are used to convert between bytes, bits, and blocks. + * All hints are managed in bits unless explicitly stated.
+ * + * To use this allocator, arch code should do the following: + * + * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate + * regular address to percpu pointer and back if they need to be + * different from the default + * + * - use pcpu_setup_first_chunk() during percpu area initialization to + * setup the first chunk containing the kernel static percpu area + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#include "percpu-internal.h" + +/* + * The slots are sorted by the size of the biggest continuous free area. + * 1-31 bytes share the same slot. + */ +#define PCPU_SLOT_BASE_SHIFT 5 +/* chunks in slots below this are subject to being sidelined on failed alloc */ +#define PCPU_SLOT_FAIL_THRESHOLD 3 + +#define PCPU_EMPTY_POP_PAGES_LOW 2 +#define PCPU_EMPTY_POP_PAGES_HIGH 4 + +#ifdef CONFIG_SMP +/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ +#ifndef __addr_to_pcpu_ptr +#define __addr_to_pcpu_ptr(addr) \ + (void __percpu *)((unsigned long)(addr) - \ + (unsigned long)pcpu_base_addr + \ + (unsigned long)__per_cpu_start) +#endif +#ifndef __pcpu_ptr_to_addr +#define __pcpu_ptr_to_addr(ptr) \ + (void __force *)((unsigned long)(ptr) + \ + (unsigned long)pcpu_base_addr - \ + (unsigned long)__per_cpu_start) +#endif +#else /* CONFIG_SMP */ +/* on UP, it's always identity mapped */ +#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr) +#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) +#endif /* CONFIG_SMP */ + +static int pcpu_unit_pages __ro_after_init; +static int pcpu_unit_size __ro_after_init; +static int pcpu_nr_units __ro_after_init; +static int pcpu_atom_size __ro_after_init; +int pcpu_nr_slots __ro_after_init; +static int 
pcpu_free_slot __ro_after_init; +int pcpu_sidelined_slot __ro_after_init; +int pcpu_to_depopulate_slot __ro_after_init; +static size_t pcpu_chunk_struct_size __ro_after_init; + +/* cpus with the lowest and highest unit addresses */ +static unsigned int pcpu_low_unit_cpu __ro_after_init; +static unsigned int pcpu_high_unit_cpu __ro_after_init; + +/* the address of the first chunk which starts with the kernel static area */ +void *pcpu_base_addr __ro_after_init; + +static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */ +const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */ + +/* group information, used for vm allocation */ +static int pcpu_nr_groups __ro_after_init; +static const unsigned long *pcpu_group_offsets __ro_after_init; +static const size_t *pcpu_group_sizes __ro_after_init; + +/* + * The first chunk which always exists. Note that unlike other + * chunks, this one can be allocated and mapped in several different + * ways and thus often doesn't live in the vmalloc area. + */ +struct pcpu_chunk *pcpu_first_chunk __ro_after_init; + +/* + * Optional reserved chunk. This chunk reserves part of the first + * chunk and serves it for reserved allocations. When the reserved + * region doesn't exist, the following variable is NULL. + */ +struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; + +DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ + +struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */ + +/* chunks which need their map areas extended, protected by pcpu_lock */ +static LIST_HEAD(pcpu_map_extend_chunks); + +/* + * The number of empty populated pages, protected by pcpu_lock. + * The reserved chunk doesn't contribute to the count. + */ +int pcpu_nr_empty_pop_pages; + +/* + * The number of populated pages in use by the allocator, protected by + * pcpu_lock. 
This number is kept per a unit per chunk (i.e. when a page gets + * allocated/deallocated, it is allocated/deallocated in all units of a chunk + * and increments/decrements this count by 1). + */ +static unsigned long pcpu_nr_populated; + +/* + * Balance work is used to populate or destroy chunks asynchronously. We + * try to keep the number of populated free pages between + * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one + * empty chunk. + */ +static void pcpu_balance_workfn(struct work_struct *work); +static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); +static bool pcpu_async_enabled __read_mostly; +static bool pcpu_atomic_alloc_failed; + +static void pcpu_schedule_balance_work(void) +{ + if (pcpu_async_enabled) + schedule_work(&pcpu_balance_work); +} + +/** + * pcpu_addr_in_chunk - check if the address is served from this chunk + * @chunk: chunk of interest + * @addr: percpu address + * + * RETURNS: + * True if the address is served from this chunk. 
+ */ +static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr) +{ + void *start_addr, *end_addr; + + if (!chunk) + return false; + + start_addr = chunk->base_addr + chunk->start_offset; + end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE - + chunk->end_offset; + + return addr >= start_addr && addr < end_addr; +} + +static int __pcpu_size_to_slot(int size) +{ + int highbit = fls(size); /* size is in bytes */ + return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); +} + +static int pcpu_size_to_slot(int size) +{ + if (size == pcpu_unit_size) + return pcpu_free_slot; + return __pcpu_size_to_slot(size); +} + +static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) +{ + const struct pcpu_block_md *chunk_md = &chunk->chunk_md; + + if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || + chunk_md->contig_hint == 0) + return 0; + + return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); +} + +/* set the pointer to a chunk in a page struct */ +static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) +{ + page->index = (unsigned long)pcpu; +} + +/* obtain pointer to a chunk from a page struct */ +static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) +{ + return (struct pcpu_chunk *)page->index; +} + +static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) +{ + return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; +} + +static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx) +{ + return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT); +} + +static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return (unsigned long)chunk->base_addr + + pcpu_unit_page_offset(cpu, page_idx); +} + +/* + * The following are helper functions to help access bitmaps and convert + * between bitmap offsets to address offsets. 
+ */ +static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index) +{ + return chunk->alloc_map + + (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG); +} + +static unsigned long pcpu_off_to_block_index(int off) +{ + return off / PCPU_BITMAP_BLOCK_BITS; +} + +static unsigned long pcpu_off_to_block_off(int off) +{ + return off & (PCPU_BITMAP_BLOCK_BITS - 1); +} + +static unsigned long pcpu_block_off_to_off(int index, int off) +{ + return index * PCPU_BITMAP_BLOCK_BITS + off; +} + +/** + * pcpu_check_block_hint - check against the contig hint + * @block: block of interest + * @bits: size of allocation + * @align: alignment of area (max PAGE_SIZE) + * + * Check to see if the allocation can fit in the block's contig hint. + * Note, a chunk uses the same hints as a block so this can also check against + * the chunk's contig hint. + */ +static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits, + size_t align) +{ + int bit_off = ALIGN(block->contig_hint_start, align) - + block->contig_hint_start; + + return bit_off + bits <= block->contig_hint; +} + +/* + * pcpu_next_hint - determine which hint to use + * @block: block of interest + * @alloc_bits: size of allocation + * + * This determines if we should scan based on the scan_hint or first_free. + * In general, we want to scan from first_free to fulfill allocations by + * first fit. However, if we know a scan_hint at position scan_hint_start + * cannot fulfill an allocation, we can begin scanning from there knowing + * the contig_hint will be our fallback. + */ +static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits) +{ + /* + * The three conditions below determine if we can skip past the + * scan_hint. First, does the scan hint exist. Second, is the + * contig_hint after the scan_hint (possibly not true iff + * contig_hint == scan_hint). Third, is the allocation request + * larger than the scan_hint. 
+ */ + if (block->scan_hint && + block->contig_hint_start > block->scan_hint_start && + alloc_bits > block->scan_hint) + return block->scan_hint_start + block->scan_hint; + + return block->first_free; +} + +/** + * pcpu_next_md_free_region - finds the next hint free area + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of free area + * + * Helper function for pcpu_for_each_md_free_region. It checks + * block->contig_hint and performs aggregation across blocks to find the + * next hint. It modifies bit_off and bits in-place to be consumed in the + * loop. + */ +static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, + int *bits) +{ + int i = pcpu_off_to_block_index(*bit_off); + int block_off = pcpu_off_to_block_off(*bit_off); + struct pcpu_block_md *block; + + *bits = 0; + for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); + block++, i++) { + /* handles contig area across blocks */ + if (*bits) { + *bits += block->left_free; + if (block->left_free == PCPU_BITMAP_BLOCK_BITS) + continue; + return; + } + + /* + * This checks three things. First is there a contig_hint to + * check. Second, have we checked this hint before by + * comparing the block_off. Third, is this the same as the + * right contig hint. In the last case, it spills over into + * the next block and should be handled by the contig area + * across blocks code. 
+ */ + *bits = block->contig_hint; + if (*bits && block->contig_hint_start >= block_off && + *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) { + *bit_off = pcpu_block_off_to_off(i, + block->contig_hint_start); + return; + } + /* reset to satisfy the second predicate above */ + block_off = 0; + + *bits = block->right_free; + *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free; + } +} + +/** + * pcpu_next_fit_region - finds fit areas for a given allocation request + * @chunk: chunk of interest + * @alloc_bits: size of allocation + * @align: alignment of area (max PAGE_SIZE) + * @bit_off: chunk offset + * @bits: size of free area + * + * Finds the next free region that is viable for use with a given size and + * alignment. This only returns if there is a valid area to be used for this + * allocation. block->first_free is returned if the allocation request fits + * within the block to see if the request can be fulfilled prior to the contig + * hint. + */ +static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, + int align, int *bit_off, int *bits) +{ + int i = pcpu_off_to_block_index(*bit_off); + int block_off = pcpu_off_to_block_off(*bit_off); + struct pcpu_block_md *block; + + *bits = 0; + for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); + block++, i++) { + /* handles contig area across blocks */ + if (*bits) { + *bits += block->left_free; + if (*bits >= alloc_bits) + return; + if (block->left_free == PCPU_BITMAP_BLOCK_BITS) + continue; + } + + /* check block->contig_hint */ + *bits = ALIGN(block->contig_hint_start, align) - + block->contig_hint_start; + /* + * This uses the block offset to determine if this has been + * checked in the prior iteration. 
+ */ + if (block->contig_hint && + block->contig_hint_start >= block_off && + block->contig_hint >= *bits + alloc_bits) { + int start = pcpu_next_hint(block, alloc_bits); + + *bits += alloc_bits + block->contig_hint_start - + start; + *bit_off = pcpu_block_off_to_off(i, start); + return; + } + /* reset to satisfy the second predicate above */ + block_off = 0; + + *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free, + align); + *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off; + *bit_off = pcpu_block_off_to_off(i, *bit_off); + if (*bits >= alloc_bits) + return; + } + + /* no valid offsets were found - fail condition */ + *bit_off = pcpu_chunk_map_bits(chunk); +} + +/* + * Metadata free area iterators. These perform aggregation of free areas + * based on the metadata blocks and return the offset @bit_off and size in + * bits of the free area @bits. pcpu_for_each_fit_region only returns when + * a fit is found for the allocation request. + */ +#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \ + for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \ + (bit_off) < pcpu_chunk_map_bits((chunk)); \ + (bit_off) += (bits) + 1, \ + pcpu_next_md_free_region((chunk), &(bit_off), &(bits))) + +#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \ + for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ + &(bits)); \ + (bit_off) < pcpu_chunk_map_bits((chunk)); \ + (bit_off) += (bits), \ + pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ + &(bits))) + +/** + * pcpu_mem_zalloc - allocate memory + * @size: bytes to allocate + * @gfp: allocation flags + * + * Allocate @size bytes. If @size is smaller than PAGE_SIZE, + * kzalloc() is used; otherwise, the equivalent of vzalloc() is used. + * This is to facilitate passing through whitelisted flags. The + * returned memory is always zeroed. + * + * RETURNS: + * Pointer to the allocated area on success, NULL on failure. 
+ */ +static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) +{ + if (WARN_ON_ONCE(!slab_is_available())) + return NULL; + + if (size <= PAGE_SIZE) + return kzalloc(size, gfp); + else + return __vmalloc(size, gfp | __GFP_ZERO); +} + +/** + * pcpu_mem_free - free memory + * @ptr: memory to free + * + * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). + */ +static void pcpu_mem_free(void *ptr) +{ + kvfree(ptr); +} + +static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot, + bool move_front) +{ + if (chunk != pcpu_reserved_chunk) { + if (move_front) + list_move(&chunk->list, &pcpu_chunk_lists[slot]); + else + list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]); + } +} + +static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot) +{ + __pcpu_chunk_move(chunk, slot, true); +} + +/** + * pcpu_chunk_relocate - put chunk in the appropriate chunk slot + * @chunk: chunk of interest + * @oslot: the previous slot it was on + * + * This function is called after an allocation or free changed @chunk. + * New slot according to the changed state is determined and @chunk is + * moved to the slot. Note that the reserved chunk is never put on + * chunk slots. + * + * CONTEXT: + * pcpu_lock. 
+ */ +static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) +{ + int nslot = pcpu_chunk_slot(chunk); + + /* leave isolated chunks in-place */ + if (chunk->isolated) + return; + + if (oslot != nslot) + __pcpu_chunk_move(chunk, nslot, oslot < nslot); +} + +static void pcpu_isolate_chunk(struct pcpu_chunk *chunk) +{ + lockdep_assert_held(&pcpu_lock); + + if (!chunk->isolated) { + chunk->isolated = true; + pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages; + } + list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]); +} + +static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk) +{ + lockdep_assert_held(&pcpu_lock); + + if (chunk->isolated) { + chunk->isolated = false; + pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages; + pcpu_chunk_relocate(chunk, -1); + } +} + +/* + * pcpu_update_empty_pages - update empty page counters + * @chunk: chunk of interest + * @nr: nr of empty pages + * + * This is used to keep track of the empty pages now based on the premise + * a md_block covers a page. The hint update functions recognize if a block + * is made full or broken to calculate deltas for keeping track of free pages. + */ +static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr) +{ + chunk->nr_empty_pop_pages += nr; + if (chunk != pcpu_reserved_chunk && !chunk->isolated) + pcpu_nr_empty_pop_pages += nr; +} + +/* + * pcpu_region_overlap - determines if two regions overlap + * @a: start of first region, inclusive + * @b: end of first region, exclusive + * @x: start of second region, inclusive + * @y: end of second region, exclusive + * + * This is used to determine if the hint region [a, b) overlaps with the + * allocated region [x, y). 
+ */
+static inline bool pcpu_region_overlap(int a, int b, int x, int y)
+{
+	return (a < y) && (x < b);	/* ranges that only touch don't overlap */
+}
+
+/**
+ * pcpu_block_update - updates a block given a free area
+ * @block: block of interest
+ * @start: start offset in block
+ * @end: end offset in block
+ *
+ * Updates a block given a known free area. The region [start, end) is
+ * expected to be the entirety of the free area within a block. Chooses
+ * the best starting offset if the contig hints are equal.
+ *
+ * Besides the contig_hint this also maintains first_free, left_free,
+ * right_free and the scan_hint. When two areas have the same size, offset 0
+ * or the offset with more trailing zero bits (per __ffs()) is preferred as
+ * the contig_hint_start.
+ */
+static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
+{
+	int contig = end - start;	/* size of the free area */
+
+	block->first_free = min(block->first_free, start);
+	if (start == 0)
+		block->left_free = contig;	/* area touches the left edge */
+
+	if (end == block->nr_bits)
+		block->right_free = contig;	/* area touches the right edge */
+
+	if (contig > block->contig_hint) {
+		/* promote the old contig_hint to be the new scan_hint */
+		if (start > block->contig_hint_start) {
+			if (block->contig_hint > block->scan_hint) {
+				block->scan_hint_start =
+					block->contig_hint_start;
+				block->scan_hint = block->contig_hint;
+			} else if (start < block->scan_hint_start) {
+				/*
+				 * The old contig_hint == scan_hint. But, the
+				 * new contig is larger so hold the invariant
+				 * scan_hint_start < contig_hint_start.
+				 */
+				block->scan_hint = 0;
+			}
+		} else {
+			/* new area starts at or before the old contig_hint */
+			block->scan_hint = 0;
+		}
+		block->contig_hint_start = start;
+		block->contig_hint = contig;
+	} else if (contig == block->contig_hint) {
+		if (block->contig_hint_start &&
+		    (!start ||
+		     __ffs(start) > __ffs(block->contig_hint_start))) {
+			/* start has a better alignment so use it */
+			block->contig_hint_start = start;
+			if (start < block->scan_hint_start &&
+			    block->contig_hint > block->scan_hint)
+				block->scan_hint = 0;
+		} else if (start > block->scan_hint_start ||
+			   block->contig_hint > block->scan_hint) {
+			/*
+			 * Knowing contig == contig_hint, update the scan_hint
+			 * if it is farther than or larger than the current
+			 * scan_hint.
+			 */
+			block->scan_hint_start = start;
+			block->scan_hint = contig;
+		}
+	} else {
+		/*
+		 * The region is smaller than the contig_hint. So only update
+		 * the scan_hint if it is larger than or equal and farther than
+		 * the current scan_hint.
+		 */
+		if ((start < block->contig_hint_start &&
+		     (contig > block->scan_hint ||
+		      (contig == block->scan_hint &&
+		       start > block->scan_hint_start)))) {
+			block->scan_hint_start = start;
+			block->scan_hint = contig;
+		}
+	}
+}
+
+/*
+ * pcpu_block_update_scan - update a block given a free area from a scan
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Finding the final allocation spot first goes through pcpu_find_block_fit()
+ * to find a block that can hold the allocation and then pcpu_alloc_area()
+ * where a scan is used. When allocations require specific alignments,
+ * we can inadvertently create holes which will not be seen in the alloc
+ * or free paths.
+ *
+ * This takes a given free area hole and updates a block as it may change the
+ * scan_hint. We need to scan backwards to ensure we don't miss free bits
+ * from alignment.
+ *
+ * Areas that would extend past the end of their block are ignored.
+ */
+static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
+				   int bits)
+{
+	int s_off = pcpu_off_to_block_off(bit_off);
+	int e_off = s_off + bits;
+	int s_index, l_bit;
+	struct pcpu_block_md *block;
+
+	if (e_off > PCPU_BITMAP_BLOCK_BITS)
+		return;		/* area crosses the block boundary */
+
+	s_index = pcpu_off_to_block_index(bit_off);
+	block = chunk->md_blocks + s_index;
+
+	/* scan backwards in case of alignment skipping free bits */
+	l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
+	s_off = (s_off == l_bit) ? 0 : l_bit + 1;	/* no set bit below: start at 0 */
+
+	pcpu_block_update(block, s_off, e_off);
+}
+
+/**
+ * pcpu_chunk_refresh_hint - updates metadata about a chunk
+ * @chunk: chunk of interest
+ * @full_scan: if we should scan from the beginning
+ *
+ * Iterates over the metadata blocks to find the largest contig area.
+ * A full scan can be avoided on the allocation path as this is triggered
+ * if we broke the contig_hint. In doing so, the scan_hint will be before
+ * the contig_hint or after if the scan_hint == contig_hint. This cannot
+ * be prevented on freeing as we want to find the largest area possibly
+ * spanning blocks.
+ */
+static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
+{
+	struct pcpu_block_md *md = &chunk->chunk_md;
+	int off, len = 0;
+
+	if (full_scan || !md->scan_hint) {
+		/* nothing to promote: rebuild starting at the first free bit */
+		off = md->first_free;
+		md->contig_hint = 0;
+	} else {
+		/* the scan_hint becomes the contig_hint; resume right after it */
+		off = md->scan_hint_start + md->scan_hint;
+		md->contig_hint_start = md->scan_hint_start;
+		md->contig_hint = md->scan_hint;
+		md->scan_hint = 0;
+	}
+
+	/* feed every remaining free region into the chunk-level hints */
+	pcpu_for_each_md_free_region(chunk, off, len)
+		pcpu_block_update(md, off, off + len);
+}
+
+/**
+ * pcpu_block_refresh_hint
+ * @chunk: chunk of interest
+ * @index: index of the metadata block
+ *
+ * Scans over the block beginning at first_free and updates the block
+ * metadata accordingly.
+ */
+static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
+{
+	struct pcpu_block_md *block = chunk->md_blocks + index;
+	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
+	unsigned int start, end;	/* region start, region end */
+
+	/* promote scan_hint to contig_hint; the scan resumes right after it */
+	if (block->scan_hint) {
+		start = block->scan_hint_start + block->scan_hint;
+		block->contig_hint_start = block->scan_hint_start;
+		block->contig_hint = block->scan_hint;
+		block->scan_hint = 0;
+	} else {
+		start = block->first_free;
+		block->contig_hint = 0;
+	}
+
+	block->right_free = 0;	/* set again below if a free area reaches the end */
+
+	/* iterate over free areas and update the contig hints */
+	for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
+		pcpu_block_update(block, start, end);
+}
+
+/**
+ * pcpu_block_update_hint_alloc - update hint on allocation path
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of request
+ *
+ * Updates metadata for the allocation path. The metadata only has to be
+ * refreshed by a full scan iff the chunk's contig hint is broken. Block level
+ * scans are required if the block's contig hint is broken.
+ *
+ * Blocks whose contig_hint covered the whole block before the allocation are
+ * counted and reported to pcpu_update_empty_pages() as no longer empty.
+ */
+static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
+					 int bits)
+{
+	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+	int nr_empty_pages = 0;
+	struct pcpu_block_md *s_block, *e_block, *block;
+	int s_index, e_index;	/* block indexes of the allocation */
+	int s_off, e_off;	/* block offsets of the allocation */
+
+	/*
+	 * Calculate per block offsets.
+	 * The calculation uses an inclusive range, but the resulting offsets
+	 * are [start, end). e_index always points to the last block in the
+	 * range.
+	 */
+	s_index = pcpu_off_to_block_index(bit_off);
+	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
+	s_off = pcpu_off_to_block_off(bit_off);
+	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
+
+	s_block = chunk->md_blocks + s_index;
+	e_block = chunk->md_blocks + e_index;
+
+	/*
+	 * Update s_block.
+	 * block->first_free must be updated if the allocation takes its place.
+	 * If the allocation breaks the contig_hint, a scan is required to
+	 * restore this hint.
+	 */
+	if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
+		nr_empty_pages++;	/* block was entirely free until now */
+
+	if (s_off == s_block->first_free)
+		s_block->first_free = find_next_zero_bit(
+					pcpu_index_alloc_map(chunk, s_index),
+					PCPU_BITMAP_BLOCK_BITS,
+					s_off + bits);
+
+	if (pcpu_region_overlap(s_block->scan_hint_start,
+				s_block->scan_hint_start + s_block->scan_hint,
+				s_off,
+				s_off + bits))
+		s_block->scan_hint = 0;
+
+	if (pcpu_region_overlap(s_block->contig_hint_start,
+				s_block->contig_hint_start +
+				s_block->contig_hint,
+				s_off,
+				s_off + bits)) {
+		/* block contig hint is broken - scan to fix it */
+		if (!s_off)
+			s_block->left_free = 0;
+		pcpu_block_refresh_hint(chunk, s_index);
+	} else {
+		/* update left and right contig manually */
+		s_block->left_free = min(s_block->left_free, s_off);
+		if (s_index == e_index)
+			s_block->right_free = min_t(int, s_block->right_free,
+					PCPU_BITMAP_BLOCK_BITS - e_off);
+		else
+			s_block->right_free = 0;
+	}
+
+	/*
+	 * Update e_block.
+	 */
+	if (s_index != e_index) {
+		if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
+			nr_empty_pages++;
+
+		/*
+		 * When the allocation is across blocks, the end is along
+		 * the left part of the e_block.
+		 */
+		e_block->first_free = find_next_zero_bit(
+				pcpu_index_alloc_map(chunk, e_index),
+				PCPU_BITMAP_BLOCK_BITS, e_off);
+
+		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
+			/* reset the block */
+			e_block++;	/* so the loop below clears it as well */
+		} else {
+			if (e_off > e_block->scan_hint_start)
+				e_block->scan_hint = 0;
+
+			e_block->left_free = 0;
+			if (e_off > e_block->contig_hint_start) {
+				/* contig hint is broken - scan to fix it */
+				pcpu_block_refresh_hint(chunk, e_index);
+			} else {
+				e_block->right_free =
+					min_t(int, e_block->right_free,
+					      PCPU_BITMAP_BLOCK_BITS - e_off);
+			}
+		}
+
+		/* update in-between md_blocks */
+		nr_empty_pages += (e_index - s_index - 1);
+		for (block = s_block + 1; block < e_block; block++) {
+			block->scan_hint = 0;
+			block->contig_hint = 0;
+			block->left_free = 0;
+			block->right_free = 0;
+		}
+	}
+
+	if (nr_empty_pages)
+		pcpu_update_empty_pages(chunk, -nr_empty_pages);
+
+	if (pcpu_region_overlap(chunk_md->scan_hint_start,
+				chunk_md->scan_hint_start +
+				chunk_md->scan_hint,
+				bit_off,
+				bit_off + bits))
+		chunk_md->scan_hint = 0;
+
+	/*
+	 * The only time a full chunk scan is required is if the chunk
+	 * contig hint is broken. Otherwise, it means a smaller space
+	 * was used and therefore the chunk contig hint is still correct.
+	 */
+	if (pcpu_region_overlap(chunk_md->contig_hint_start,
+				chunk_md->contig_hint_start +
+				chunk_md->contig_hint,
+				bit_off,
+				bit_off + bits))
+		pcpu_chunk_refresh_hint(chunk, false);
+}
+
+/**
+ * pcpu_block_update_hint_free - updates the block hints on the free path
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of request
+ *
+ * Updates metadata for the free path. This avoids a blind block
+ * refresh by making use of the block contig hints. If this fails, it scans
+ * forward and backward to determine the extent of the free area. This is
+ * capped at the boundary of blocks.
+ *
+ * A chunk update is triggered if a page becomes free, a block becomes free,
+ * or the free spans across blocks.
This tradeoff is to minimize iterating
+ * over the block metadata to update chunk_md->contig_hint.
+ * chunk_md->contig_hint may be off by up to a page, but it will never be more
+ * than the available space. If the contig hint is contained in one block, it
+ * will be accurate.
+ *
+ * When the freed area stays inside one block and does not free the whole
+ * block, the chunk-level hints are updated directly with the free area
+ * expressed in chunk offsets on both ends.
+ */
+static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
+					int bits)
+{
+	int nr_empty_pages = 0;
+	struct pcpu_block_md *s_block, *e_block, *block;
+	int s_index, e_index;	/* block indexes of the freed allocation */
+	int s_off, e_off;	/* block offsets of the freed allocation */
+	int start, end;		/* start and end of the whole free area */
+
+	/*
+	 * Calculate per block offsets.
+	 * The calculation uses an inclusive range, but the resulting offsets
+	 * are [start, end). e_index always points to the last block in the
+	 * range.
+	 */
+	s_index = pcpu_off_to_block_index(bit_off);
+	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
+	s_off = pcpu_off_to_block_off(bit_off);
+	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
+
+	s_block = chunk->md_blocks + s_index;
+	e_block = chunk->md_blocks + e_index;
+
+	/*
+	 * Check if the freed area aligns with the block->contig_hint.
+	 * If it does, then the scan to find the beginning/end of the
+	 * larger free area can be avoided.
+	 *
+	 * start and end refer to beginning and end of the free area
+	 * within each their respective blocks. This is not necessarily
+	 * the entire free area as it may span blocks past the beginning
+	 * or end of the block.
+	 */
+	start = s_off;
+	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
+		start = s_block->contig_hint_start;
+	} else {
+		/*
+		 * Scan backwards to find the extent of the free area.
+		 * find_last_bit returns the starting bit, so if the start bit
+		 * is returned, that means there was no last bit and the
+		 * block is free from its beginning up to the freed area.
+		 */
+		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
+					  start);
+		start = (start == l_bit) ? 0 : l_bit + 1;
+	}
+
+	end = e_off;
+	if (e_off == e_block->contig_hint_start)
+		end = e_block->contig_hint_start + e_block->contig_hint;
+	else
+		end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
+				    PCPU_BITMAP_BLOCK_BITS, end);
+
+	/* update s_block */
+	e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
+	if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
+		nr_empty_pages++;
+	pcpu_block_update(s_block, start, e_off);
+
+	/* freeing across blocks */
+	if (s_index != e_index) {
+		/* update e_block */
+		if (end == PCPU_BITMAP_BLOCK_BITS)
+			nr_empty_pages++;
+		pcpu_block_update(e_block, 0, end);
+
+		/* reset md_blocks in the middle */
+		nr_empty_pages += (e_index - s_index - 1);
+		for (block = s_block + 1; block < e_block; block++) {
+			block->first_free = 0;
+			block->scan_hint = 0;
+			block->contig_hint_start = 0;
+			block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
+			block->left_free = PCPU_BITMAP_BLOCK_BITS;
+			block->right_free = PCPU_BITMAP_BLOCK_BITS;
+		}
+	}
+
+	if (nr_empty_pages)
+		pcpu_update_empty_pages(chunk, nr_empty_pages);
+
+	/*
+	 * Refresh chunk metadata when the free makes a block free or spans
+	 * across blocks. The contig_hint may be off by up to a page, but if
+	 * the contig_hint is contained in a block, it will be accurate with
+	 * the else condition below.
+	 *
+	 * In the else case start and end are offsets within block s_index,
+	 * while chunk_md is indexed by chunk offsets. Both ends must be
+	 * converted; passing a block-relative end would make the computed
+	 * length negative for every block but the first and the chunk hint
+	 * would never be updated.
+	 */
+	if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
+		pcpu_chunk_refresh_hint(chunk, true);
+	else
+		pcpu_block_update(&chunk->chunk_md,
+				  pcpu_block_off_to_off(s_index, start),
+				  pcpu_block_off_to_off(s_index, end));
+}
+
+/**
+ * pcpu_is_populated - determines if the region is populated
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of area
+ * @next_off: return value for the next offset to start searching
+ *
+ * For atomic allocations, check if the backing pages are populated.
+ *
+ * RETURNS:
+ * Bool if the backing pages are populated.
+ * @next_off is to skip over unpopulated blocks in pcpu_find_block_fit.
+ */
+static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
+			      int *next_off)
+{
+	/* page range backing [bit_off, bit_off + bits), rounded outwards */
+	unsigned int pg_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
+	unsigned int pg_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
+	unsigned int hole, resume;
+
+	/* first unpopulated page in the range, if any */
+	hole = find_next_zero_bit(chunk->populated, pg_end, pg_start);
+	if (hole >= pg_end)
+		return true;
+
+	/* next populated page after the hole, converted back to map bits */
+	resume = find_next_bit(chunk->populated, pg_end, hole + 1);
+	*next_off = resume * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
+
+	return false;
+}
+
+/**
+ * pcpu_find_block_fit - pick the bitmap offset at which to start searching
+ * @chunk: chunk of interest
+ * @alloc_bits: size of request in allocation units
+ * @align: alignment of area (max PAGE_SIZE bytes)
+ * @pop_only: use populated regions only
+ *
+ * Walks the fit regions reported by the metadata blocks of @chunk and
+ * returns the offset of the first one that is acceptable. With @pop_only
+ * set, a region is only acceptable if pcpu_is_populated() agrees; otherwise
+ * the walk resumes at the offset that function reports. The chunk-level
+ * hint is checked up front so that a request which cannot fit is rejected
+ * without walking the blocks. This is not quite first fit: areas outside
+ * the contig hint of a block or chunk may be skipped, and poor alignment
+ * can cause valid free areas to be passed over.
+ *
+ * RETURNS:
+ * The offset in the bitmap to begin searching.
+ * -1 if no offset is found.
+ */
+static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
+			       size_t align, bool pop_only)
+{
+	struct pcpu_block_md *md = &chunk->chunk_md;
+	int off, len = 0, resume;
+
+	/*
+	 * If the request does not fit the chunk-wide hint, assume memory
+	 * pressure and let the caller move on instead of scanning.
+	 */
+	if (!pcpu_check_block_hint(md, alloc_bits, align))
+		return -1;
+
+	off = pcpu_next_hint(md, alloc_bits);
+	pcpu_for_each_fit_region(chunk, alloc_bits, align, off, len) {
+		if (!pop_only)
+			break;
+		if (pcpu_is_populated(chunk, off, len, &resume))
+			break;
+
+		/* skip the unpopulated pages and look for the next region */
+		off = resume;
+		len = 0;
+	}
+
+	return (off == pcpu_chunk_map_bits(chunk)) ? -1 : off;
+}
+
+/*
+ * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
+ * @map: the address to base the search on
+ * @size: the bitmap size in bits
+ * @start: the bitnumber to start searching at
+ * @nr: the number of zeroed bits we're looking for
+ * @align_mask: alignment mask for zero area
+ * @largest_off: offset of the largest area skipped
+ * @largest_bits: size of the largest area skipped
+ *
+ * The @align_mask should be one less than a power of 2.
+ *
+ * This is a modified version of bitmap_find_next_zero_area_off() to remember
+ * the largest area that was skipped. This is imperfect, but in general is
+ * good enough. The largest remembered region is the largest failed region
+ * seen. This does not include anything we possibly skipped due to alignment.
+ * pcpu_block_update_scan() does scan backwards to try and recover what was
+ * lost to alignment. While this can cause scanning to miss earlier possible
+ * free areas, smaller allocations will eventually fill those holes.
+ */
+static unsigned long pcpu_find_zero_area(unsigned long *map,
+					 unsigned long size,
+					 unsigned long start,
+					 unsigned long nr,
+					 unsigned long align_mask,
+					 unsigned long *largest_off,
+					 unsigned long *largest_bits)
+{
+	unsigned long index, end, i, area_off, area_bits;
+again:
+	index = find_next_zero_bit(map, size, start);
+
+	/* Align allocation */
+	index = __ALIGN_MASK(index, align_mask);
+	area_off = index;
+
+	end = index + nr;
+	if (end > size)
+		return end;	/* no fit: the value returned is past @size */
+	i = find_next_bit(map, end, index);
+	if (i < end) {
+		area_bits = i - area_off;	/* clear bits before the set bit */
+		/* remember largest unused area with best alignment */
+		if (area_bits > *largest_bits ||
+		    (area_bits == *largest_bits && *largest_off &&
+		     (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
+			*largest_off = area_off;
+			*largest_bits = area_bits;
+		}
+
+		start = i + 1;	/* retry just past the set bit */
+		goto again;
+	}
+	return index;	/* [index, index + nr) holds no set bit */
+}
+
+/**
+ * pcpu_alloc_area - allocates an area from a pcpu_chunk
+ * @chunk: chunk of interest
+ * @alloc_bits: size of request in allocation units
+ * @align: alignment of area (max PAGE_SIZE)
+ * @start: bit_off to start searching
+ *
+ * This function takes in a @start offset to begin searching to fit an
+ * allocation of @alloc_bits with alignment @align. It needs to scan
+ * the allocation map because if it fits within the block's contig hint,
+ * @start will be block->first_free. This is an attempt to fill the
+ * allocation prior to breaking the contig hint. The allocation and
+ * boundary maps are updated accordingly if it confirms a valid
+ * free area.
+ *
+ * The scan is limited to @start + @alloc_bits + PCPU_BITMAP_BLOCK_BITS or
+ * the end of the chunk map, whichever comes first. On success free_bytes,
+ * the chunk's first_free, the block hints and the chunk's slot are updated.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ *
+ * RETURNS:
+ * Allocated addr offset in @chunk on success.
+ * -1 if no matching area is found.
+ */
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
+			   size_t align, int start)
+{
+	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+	size_t align_mask = (align) ? (align - 1) : 0;
+	unsigned long area_off = 0, area_bits = 0;
+	int bit_off, end, oslot;
+
+	lockdep_assert_held(&pcpu_lock);
+
+	oslot = pcpu_chunk_slot(chunk);	/* slot before the metadata changes */
+
+	/*
+	 * Search to find a fit.
+	 */
+	end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
+		    pcpu_chunk_map_bits(chunk));
+	bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
+				      align_mask, &area_off, &area_bits);
+	if (bit_off >= end)
+		return -1;
+
+	if (area_bits)	/* a too-small area was skipped; record it as a hint */
+		pcpu_block_update_scan(chunk, area_off, area_bits);
+
+	/* update alloc map */
+	bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
+
+	/* update boundary map */
+	set_bit(bit_off, chunk->bound_map);	/* start of this area */
+	bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
+	set_bit(bit_off + alloc_bits, chunk->bound_map);	/* first bit past it */
+
+	chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
+
+	/* update first free bit */
+	if (bit_off == chunk_md->first_free)
+		chunk_md->first_free = find_next_zero_bit(
+					chunk->alloc_map,
+					pcpu_chunk_map_bits(chunk),
+					bit_off + alloc_bits);
+
+	pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
+
+	pcpu_chunk_relocate(chunk, oslot);
+
+	return bit_off * PCPU_MIN_ALLOC_SIZE;	/* bits back to a byte offset */
+}
+
+/**
+ * pcpu_free_area - frees the corresponding offset
+ * @chunk: chunk of interest
+ * @off: addr offset into chunk
+ *
+ * This function determines the size of an allocation to free using
+ * the boundary bitmap and clears the allocation map.
+ *
+ * RETURNS:
+ * Number of freed bytes.
+ */ +static int pcpu_free_area(struct pcpu_chunk *chunk, int off) +{ + struct pcpu_block_md *chunk_md = &chunk->chunk_md; + int bit_off, bits, end, oslot, freed; + + lockdep_assert_held(&pcpu_lock); + pcpu_stats_area_dealloc(chunk); + + oslot = pcpu_chunk_slot(chunk); + + bit_off = off / PCPU_MIN_ALLOC_SIZE; + + /* find end index */ + end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), + bit_off + 1); + bits = end - bit_off; + bitmap_clear(chunk->alloc_map, bit_off, bits); + + freed = bits * PCPU_MIN_ALLOC_SIZE; + + /* update metadata */ + chunk->free_bytes += freed; + + /* update first free bit */ + chunk_md->first_free = min(chunk_md->first_free, bit_off); + + pcpu_block_update_hint_free(chunk, bit_off, bits); + + pcpu_chunk_relocate(chunk, oslot); + + return freed; +} + +static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits) +{ + block->scan_hint = 0; + block->contig_hint = nr_bits; + block->left_free = nr_bits; + block->right_free = nr_bits; + block->first_free = 0; + block->nr_bits = nr_bits; +} + +static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) +{ + struct pcpu_block_md *md_block; + + /* init the chunk's block */ + pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk)); + + for (md_block = chunk->md_blocks; + md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); + md_block++) + pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS); +} + +/** + * pcpu_alloc_first_chunk - creates chunks that serve the first chunk + * @tmp_addr: the start of the region served + * @map_size: size of the region served + * + * This is responsible for creating the chunks that serve the first chunk. The + * base_addr is page aligned down of @tmp_addr while the region end is page + * aligned up. Offsets are kept track of to determine the region served. All + * this is done to appease the bitmap allocator in avoiding partial blocks. + * + * RETURNS: + * Chunk serving the region at @tmp_addr of @map_size. 
+ */
+static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
+							 int map_size)
+{
+	struct pcpu_chunk *chunk;
+	unsigned long aligned_addr, lcm_align;
+	int start_offset, offset_bits, region_size, region_bits;
+	size_t alloc_size;
+
+	/* region calculations */
+	aligned_addr = tmp_addr & PAGE_MASK;	/* base rounded down to a page */
+
+	start_offset = tmp_addr - aligned_addr;	/* bytes hidden at the front */
+
+	/*
+	 * Align the end of the region with the LCM of PAGE_SIZE and
+	 * PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of
+	 * the other.
+	 */
+	lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
+	region_size = ALIGN(start_offset + map_size, lcm_align);
+
+	/* allocate chunk, sized to include the trailing populated bitmap */
+	alloc_size = struct_size(chunk, populated,
+				 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
+	chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+	if (!chunk)
+		panic("%s: Failed to allocate %zu bytes\n", __func__,
+		      alloc_size);
+
+	INIT_LIST_HEAD(&chunk->list);
+
+	chunk->base_addr = (void *)aligned_addr;
+	chunk->start_offset = start_offset;
+	chunk->end_offset = region_size - chunk->start_offset - map_size;	/* bytes hidden at the tail */
+
+	chunk->nr_pages = region_size >> PAGE_SHIFT;
+	region_bits = pcpu_chunk_map_bits(chunk);
+
+	alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
+	chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+	if (!chunk->alloc_map)
+		panic("%s: Failed to allocate %zu bytes\n", __func__,
+		      alloc_size);
+
+	alloc_size =
+		BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);	/* extra bit: end marker at region_bits */
+	chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+	if (!chunk->bound_map)
+		panic("%s: Failed to allocate %zu bytes\n", __func__,
+		      alloc_size);
+
+	alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
+	chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+	if (!chunk->md_blocks)
+		panic("%s: Failed to allocate %zu bytes\n", __func__,
+		      alloc_size);
+
+#ifdef CONFIG_MEMCG_KMEM
+	/* first chunk is free to use */
+	chunk->obj_cgroups = NULL;
+#endif
+	pcpu_init_md_blocks(chunk);
+
+	/* manage populated page bitmap: every page is marked populated */
+	chunk->immutable = true;
+	bitmap_fill(chunk->populated, chunk->nr_pages);
+	chunk->nr_populated = chunk->nr_pages;
+	chunk->nr_empty_pop_pages = chunk->nr_pages;
+
+	chunk->free_bytes = map_size;	/* only the served region is allocatable */
+
+	if (chunk->start_offset) {
+		/* hide the beginning of the bitmap */
+		offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
+		bitmap_set(chunk->alloc_map, 0, offset_bits);
+		set_bit(0, chunk->bound_map);
+		set_bit(offset_bits, chunk->bound_map);
+
+		chunk->chunk_md.first_free = offset_bits;
+
+		pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
+	}
+
+	if (chunk->end_offset) {
+		/* hide the end of the bitmap */
+		offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
+		bitmap_set(chunk->alloc_map,
+			   pcpu_chunk_map_bits(chunk) - offset_bits,
+			   offset_bits);
+		set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
+			chunk->bound_map);
+		set_bit(region_bits, chunk->bound_map);
+
+		pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
+					     - offset_bits, offset_bits);
+	}
+
+	return chunk;
+}
+/*
+ * pcpu_alloc_chunk - allocate a chunk descriptor together with its maps
+ * @gfp: allocation flags handed to pcpu_mem_zalloc()
+ *
+ * Allocates the chunk struct, the alloc_map, the bound_map (one bit longer
+ * than the alloc_map), the md_blocks array and, with CONFIG_MEMCG_KMEM and
+ * unless mem_cgroup_kmem_disabled(), the obj_cgroups array. The block
+ * metadata is initialised and free_bytes is set to nr_pages * PAGE_SIZE.
+ * Everything allocated so far is released when a later allocation fails.
+ *
+ * RETURNS:
+ * The new chunk on success, NULL on failure.
+ */
+static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
+{
+	struct pcpu_chunk *chunk;
+	int region_bits;
+
+	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
+	if (!chunk)
+		return NULL;
+
+	INIT_LIST_HEAD(&chunk->list);
+	chunk->nr_pages = pcpu_unit_pages;
+	region_bits = pcpu_chunk_map_bits(chunk);
+
+	chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
+					   sizeof(chunk->alloc_map[0]), gfp);
+	if (!chunk->alloc_map)
+		goto alloc_map_fail;
+
+	chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
+					   sizeof(chunk->bound_map[0]), gfp);
+	if (!chunk->bound_map)
+		goto bound_map_fail;
+
+	chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
+					   sizeof(chunk->md_blocks[0]), gfp);
+	if (!chunk->md_blocks)
+		goto md_blocks_fail;
+
+#ifdef CONFIG_MEMCG_KMEM
+	if (!mem_cgroup_kmem_disabled()) {
+		chunk->obj_cgroups =
+			pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
+					sizeof(struct obj_cgroup *), gfp);
+		if (!chunk->obj_cgroups)
+			goto objcg_fail;
+	}
+#endif
+
+	pcpu_init_md_blocks(chunk);
+
+	/* init metadata */
+	chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
+
+	return chunk;
+
+	/* error unwinding: labels fall through, freeing in reverse order */
+#ifdef CONFIG_MEMCG_KMEM
+objcg_fail:
+	pcpu_mem_free(chunk->md_blocks);
+#endif
+md_blocks_fail:
+	pcpu_mem_free(chunk->bound_map);
+bound_map_fail:
+	pcpu_mem_free(chunk->alloc_map);
+alloc_map_fail:
+	pcpu_mem_free(chunk);
+
+	return NULL;
+}
+/*
+ * pcpu_free_chunk - free a chunk descriptor and its maps
+ * @chunk: chunk to free, may be NULL
+ *
+ * Releases the obj_cgroups array (CONFIG_MEMCG_KMEM only), md_blocks,
+ * bound_map, alloc_map and finally the chunk itself. A NULL @chunk is
+ * ignored.
+ */
+static void pcpu_free_chunk(struct pcpu_chunk *chunk)
+{
+	if (!chunk)
+		return;
+#ifdef CONFIG_MEMCG_KMEM
+	pcpu_mem_free(chunk->obj_cgroups);
+#endif
+	pcpu_mem_free(chunk->md_blocks);
+	pcpu_mem_free(chunk->bound_map);
+	pcpu_mem_free(chunk->alloc_map);
+	pcpu_mem_free(chunk);
+}
+
+/**
+ * pcpu_chunk_populated - post-population bookkeeping
+ * @chunk: pcpu_chunk which got populated
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * Pages in [@page_start,@page_end) have been populated to @chunk. Update
+ * the bookkeeping information accordingly. Must be called after each
+ * successful population.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
+				 int page_end)
+{
+	int nr = page_end - page_start;	/* number of pages populated */
+
+	lockdep_assert_held(&pcpu_lock);
+
+	bitmap_set(chunk->populated, page_start, nr);
+	chunk->nr_populated += nr;
+	pcpu_nr_populated += nr;
+
+	pcpu_update_empty_pages(chunk, nr);	/* new pages are counted as empty */
+}
+
+/**
+ * pcpu_chunk_depopulated - post-depopulation bookkeeping
+ * @chunk: pcpu_chunk which got depopulated
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * Pages in [@page_start,@page_end) have been depopulated from @chunk.
+ * Update the bookkeeping information accordingly. Must be called after
+ * each successful depopulation.
+ */
+static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
+				   int page_start, int page_end)
+{
+	int nr = page_end - page_start;	/* number of pages depopulated */
+
+	lockdep_assert_held(&pcpu_lock);
+
+	bitmap_clear(chunk->populated, page_start, nr);
+	chunk->nr_populated -= nr;
+	pcpu_nr_populated -= nr;
+
+	pcpu_update_empty_pages(chunk, -nr);	/* they no longer count as empty */
+}
+
+/*
+ * Chunk management implementation.
+ *
+ * To allow different implementations, chunk alloc/free and
+ * [de]population are implemented in a separate file which is pulled
+ * into this file and compiled together. The following functions
+ * should be implemented.
+ *
+ * pcpu_populate_chunk - populate the specified range of a chunk
+ * pcpu_depopulate_chunk - depopulate the specified range of a chunk
+ * pcpu_post_unmap_tlb_flush - flush tlb for the specified range of a chunk
+ * pcpu_create_chunk - create a new chunk
+ * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
+ * pcpu_addr_to_page - translate address to its struct page
+ * pcpu_verify_alloc_info - check alloc_info is acceptable during init
+ *
+ * The prototypes below are declared here so that this file can call the
+ * functions; the definitions come from whichever file is included
+ * afterwards, selected by CONFIG_NEED_PER_CPU_KM.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
+			       int page_start, int page_end, gfp_t gfp);
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
+				  int page_start, int page_end);
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+				      int page_start, int page_end);
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
+static struct page *pcpu_addr_to_page(void *addr);
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
+
+#ifdef CONFIG_NEED_PER_CPU_KM
+#include "percpu-km.c"
+#else
+#include "percpu-vm.c"
+#endif	/* CONFIG_NEED_PER_CPU_KM */
+
+/**
+ * pcpu_chunk_addr_search - determine chunk containing specified address
+ * @addr: address for which the chunk needs to be determined.
+ *
+ * This is an internal function that handles all but static allocations.
+ * Static percpu address values should never be passed into the allocator.
+ * + * RETURNS: + * The address of the found chunk. + */ +static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) +{ + /* is it in the dynamic region (first chunk)? */ + if (pcpu_addr_in_chunk(pcpu_first_chunk, addr)) + return pcpu_first_chunk; + + /* is it in the reserved region? */ + if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr)) + return pcpu_reserved_chunk; + + /* + * The address is relative to unit0 which might be unused and + * thus unmapped. Offset the address to the unit space of the + * current processor before looking it up in the vmalloc + * space. Note that any possible cpu id can be used here, so + * there's no need to worry about preemption or cpu hotplug. + */ + addr += pcpu_unit_offsets[raw_smp_processor_id()]; + return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); +} + +#ifdef CONFIG_MEMCG_KMEM +static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, + struct obj_cgroup **objcgp) +{ + struct obj_cgroup *objcg; + + if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT)) + return true; /* not accounted: *objcgp is left untouched */ + + objcg = get_obj_cgroup_from_current(); + if (!objcg) + return true; + + if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) { + obj_cgroup_put(objcg); + return false; /* charge failed, reference already dropped */ + } + + *objcgp = objcg; + return true; /* charged: the reference is handed to the caller */ +} + +static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, + struct pcpu_chunk *chunk, int off, + size_t size) +{ + if (!objcg) + return; + + if (likely(chunk && chunk->obj_cgroups)) { + chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; + + rcu_read_lock(); + mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, + pcpu_obj_full_size(size)); + rcu_read_unlock(); + } else { /* no chunk or no obj_cgroups array: uncharge and drop the reference */ + obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); + obj_cgroup_put(objcg); + } +} + +static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) +{ + struct obj_cgroup *objcg; + + if (unlikely(!chunk->obj_cgroups)) + return; + + objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT]; + if (!objcg) + return; + 
chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; + + obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); + + rcu_read_lock(); + mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, + -pcpu_obj_full_size(size)); + rcu_read_unlock(); + + obj_cgroup_put(objcg); +} + +#else /* CONFIG_MEMCG_KMEM */ +static bool +pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) +{ + return true; +} + +static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, + struct pcpu_chunk *chunk, int off, + size_t size) +{ +} + +static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +/** + * pcpu_alloc - the percpu allocator + * @size: size of area to allocate in bytes, rounded up to PCPU_MIN_ALLOC_SIZE + * @align: alignment of area (power of 2, max PAGE_SIZE) + * @reserved: allocate from the reserved chunk if available + * @gfp: allocation flags + * + * Allocate zero-filled percpu area of @size bytes aligned at @align. If + * @gfp doesn't contain %GFP_KERNEL, the allocation is atomic. If @gfp has + * __GFP_NOWARN then no warning will be triggered on invalid or failed + * allocation requests. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, + gfp_t gfp) +{ + gfp_t pcpu_gfp; + bool is_atomic; + bool do_warn; + struct obj_cgroup *objcg = NULL; + static int warn_limit = 10; /* failure warnings left before they are disabled */ + struct pcpu_chunk *chunk, *next; + const char *err; + int slot, off, cpu, ret; + unsigned long flags; + void __percpu *ptr; + size_t bits, bit_align; + + gfp = current_gfp_context(gfp); + /* whitelisted flags that can be passed to the backing allocators */ + pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); + is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; + do_warn = !(gfp & __GFP_NOWARN); + + /* + * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, + * therefore alignment must be a minimum of that many bytes. 
+ * An allocation may have internal fragmentation from rounding up + * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes. + */ + if (unlikely(align < PCPU_MIN_ALLOC_SIZE)) + align = PCPU_MIN_ALLOC_SIZE; + + size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); + bits = size >> PCPU_MIN_ALLOC_SHIFT; + bit_align = align >> PCPU_MIN_ALLOC_SHIFT; + + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || + !is_power_of_2(align))) { + WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n", + size, align); + return NULL; + } + + if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg))) + return NULL; + + if (!is_atomic) { + /* + * pcpu_balance_workfn() allocates memory under this mutex, + * and it may wait for memory reclaim. Allow current task + * to become OOM victim, in case of memory pressure. + */ + if (gfp & __GFP_NOFAIL) { + mutex_lock(&pcpu_alloc_mutex); + } else if (mutex_lock_killable(&pcpu_alloc_mutex)) { + pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); /* NULL chunk: undo the memcg charge, if any */ + return NULL; + } + } + + spin_lock_irqsave(&pcpu_lock, flags); + + /* serve reserved allocations from the reserved chunk if available */ + if (reserved && pcpu_reserved_chunk) { + chunk = pcpu_reserved_chunk; + + off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); + if (off < 0) { + err = "alloc from reserved chunk failed"; + goto fail_unlock; + } + + off = pcpu_alloc_area(chunk, bits, bit_align, off); + if (off >= 0) + goto area_found; + + err = "alloc from reserved chunk failed"; + goto fail_unlock; + } + +restart: + /* search through normal chunks */ + for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) { + list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot], + list) { + off = pcpu_find_block_fit(chunk, bits, bit_align, + is_atomic); + if (off < 0) { + if (slot < PCPU_SLOT_FAIL_THRESHOLD) + pcpu_chunk_move(chunk, 0); + continue; + } + + off = pcpu_alloc_area(chunk, bits, bit_align, off); + if (off >= 0) { + pcpu_reintegrate_chunk(chunk); + goto 
area_found; + } + } + } + + spin_unlock_irqrestore(&pcpu_lock, flags); + + /* + * No space left. Create a new chunk. We don't want multiple + * tasks to create chunks simultaneously. Serialize and create iff + * there's still no empty chunk after grabbing the mutex. + */ + if (is_atomic) { + err = "atomic alloc failed, no space left"; + goto fail; + } + + if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) { + chunk = pcpu_create_chunk(pcpu_gfp); + if (!chunk) { + err = "failed to allocate new chunk"; + goto fail; + } + + spin_lock_irqsave(&pcpu_lock, flags); + pcpu_chunk_relocate(chunk, -1); + } else { + spin_lock_irqsave(&pcpu_lock, flags); + } + + goto restart; + +area_found: + pcpu_stats_area_alloc(chunk, size); + spin_unlock_irqrestore(&pcpu_lock, flags); + + /* populate if not all pages are already there */ + if (!is_atomic) { + unsigned int page_end, rs, re; + + rs = PFN_DOWN(off); + page_end = PFN_UP(off + size); + + for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) { + WARN_ON(chunk->immutable); + + ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); + + spin_lock_irqsave(&pcpu_lock, flags); + if (ret) { + pcpu_free_area(chunk, off); + err = "failed to populate"; + goto fail_unlock; + } + pcpu_chunk_populated(chunk, rs, re); + spin_unlock_irqrestore(&pcpu_lock, flags); + } + + mutex_unlock(&pcpu_alloc_mutex); + } + + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + pcpu_schedule_balance_work(); + + /* clear the areas and return address relative to base address */ + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); + + ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); + kmemleak_alloc_percpu(ptr, size, gfp); + + trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align, + chunk->base_addr, off, ptr, + pcpu_obj_full_size(size), gfp); + + pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); + + return ptr; + +fail_unlock: + spin_unlock_irqrestore(&pcpu_lock, flags); +fail: + 
trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); + + if (!is_atomic && do_warn && warn_limit) { + pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); + dump_stack(); + if (!--warn_limit) + pr_info("limit reached, disable warning\n"); + } + if (is_atomic) { + /* see the flag handling in pcpu_balance_populated() */ + pcpu_atomic_alloc_failed = true; + pcpu_schedule_balance_work(); + } else { + mutex_unlock(&pcpu_alloc_mutex); + } + + pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); /* NULL chunk: undo the memcg charge, if any */ + + return NULL; +} + +/** + * __alloc_percpu_gfp - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * @gfp: allocation flags + * + * Allocate zero-filled percpu area of @size bytes aligned at @align. If + * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can + * be called from any context but is a lot more likely to fail. If @gfp + * has __GFP_NOWARN then no warning will be triggered on invalid or failed + * allocation requests. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) +{ + return pcpu_alloc(size, align, false, gfp); +} +EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). 
+ */ +void __percpu *__alloc_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, false, GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(__alloc_percpu); + +/** + * __alloc_reserved_percpu - allocate reserved percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate zero-filled percpu area of @size bytes aligned at @align + * from reserved percpu area if arch has set it up; otherwise, + * allocation is served from the same dynamic area as __alloc_percpu(). + * Might sleep. Might trigger writeouts. + * + * CONTEXT: + * Does GFP_KERNEL allocation. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void __percpu *__alloc_reserved_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, true, GFP_KERNEL); +} + +/** + * pcpu_balance_free - manage the amount of free chunks + * @empty_only: free chunks only if they have no empty populated pages + * + * If empty_only is %false, reclaim all fully free chunks regardless of the + * number of populated pages. Otherwise, only reclaim chunks whose + * nr_empty_pop_pages is zero. + * + * CONTEXT: + * pcpu_lock (can be dropped temporarily) + */ +static void pcpu_balance_free(bool empty_only) +{ + LIST_HEAD(to_free); + struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot]; + struct pcpu_chunk *chunk, *next; + + lockdep_assert_held(&pcpu_lock); + + /* + * There's no reason to keep around multiple unused chunks and VM + * areas can be scarce. Destroy all free chunks except for one. 
+ */ + list_for_each_entry_safe(chunk, next, free_head, list) { + WARN_ON(chunk->immutable); + + /* spare the first one */ + if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) + continue; + + if (!empty_only || chunk->nr_empty_pop_pages == 0) + list_move(&chunk->list, &to_free); + } + + if (list_empty(&to_free)) + return; + + spin_unlock_irq(&pcpu_lock); /* depopulation and chunk destruction run outside pcpu_lock */ + list_for_each_entry_safe(chunk, next, &to_free, list) { + unsigned int rs, re; + + for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) { + pcpu_depopulate_chunk(chunk, rs, re); + spin_lock_irq(&pcpu_lock); + pcpu_chunk_depopulated(chunk, rs, re); + spin_unlock_irq(&pcpu_lock); + } + pcpu_destroy_chunk(chunk); + cond_resched(); + } + spin_lock_irq(&pcpu_lock); +} + +/** + * pcpu_balance_populated - manage the amount of populated pages + * + * Maintain a certain amount of populated pages to satisfy atomic allocations. + * It is possible that this is called when physical memory is scarce causing + * OOM killer to be triggered. We should avoid doing so until an actual + * allocation causes the failure as it is possible that requests can be + * serviced from already backed regions. + * + * CONTEXT: + * pcpu_lock (can be dropped temporarily) + */ +static void pcpu_balance_populated(void) +{ + /* gfp flags passed to underlying allocators */ + const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; + struct pcpu_chunk *chunk; + int slot, nr_to_pop, ret; + + lockdep_assert_held(&pcpu_lock); + + /* + * Ensure there are certain number of free populated pages for + * atomic allocs. Fill up from the most packed so that atomic + * allocs don't increase fragmentation. If atomic allocation + * failed previously, always populate the maximum amount. This + * should prevent atomic allocs larger than PAGE_SIZE from keeping + * failing indefinitely; however, large atomic allocs are not + * something we support properly and can be highly unreliable and + * inefficient. 
+ */ +retry_pop: + if (pcpu_atomic_alloc_failed) { + nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; + /* best effort anyway, don't worry about synchronization */ + pcpu_atomic_alloc_failed = false; + } else { + nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - + pcpu_nr_empty_pop_pages, + 0, PCPU_EMPTY_POP_PAGES_HIGH); + } + + for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) { + unsigned int nr_unpop = 0, rs, re; + + if (!nr_to_pop) + break; + + list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) { + nr_unpop = chunk->nr_pages - chunk->nr_populated; + if (nr_unpop) + break; + } + + if (!nr_unpop) + continue; /* no chunk in this slot has unpopulated pages */ + + /* @chunk can't go away while pcpu_alloc_mutex is held */ + for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) { + int nr = min_t(int, re - rs, nr_to_pop); + + spin_unlock_irq(&pcpu_lock); + ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp); + cond_resched(); + spin_lock_irq(&pcpu_lock); + if (!ret) { + nr_to_pop -= nr; + pcpu_chunk_populated(chunk, rs, rs + nr); + } else { + nr_to_pop = 0; /* population failed: give up, no new chunk either */ + } + + if (!nr_to_pop) + break; + } + } + + if (nr_to_pop) { + /* ran out of chunks to populate, create a new one and retry */ + spin_unlock_irq(&pcpu_lock); + chunk = pcpu_create_chunk(gfp); + cond_resched(); + spin_lock_irq(&pcpu_lock); + if (chunk) { + pcpu_chunk_relocate(chunk, -1); + goto retry_pop; + } + } +} + +/** + * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages + * + * Scan over chunks in the depopulate list and try to release unused populated + * pages back to the system. Depopulated chunks are sidelined to prevent + * repopulating these pages unless required. Fully free chunks are reintegrated + * and freed accordingly (1 is kept around). If we drop below the empty + * populated pages threshold, reintegrate the chunk if it has empty free pages. + * Each chunk is scanned in the reverse order to keep populated pages close to + * the beginning of the chunk. 
+ * + * CONTEXT: + * pcpu_lock (can be dropped temporarily) + * + */ +static void pcpu_reclaim_populated(void) +{ + struct pcpu_chunk *chunk; + struct pcpu_block_md *block; + int freed_page_start, freed_page_end; + int i, end; + bool reintegrate; + + lockdep_assert_held(&pcpu_lock); + + /* + * Once a chunk is isolated to the to_depopulate list, the chunk is no + * longer discoverable to allocations which may populate pages. The only + * other accessor is the free path which only returns areas back to the + * allocator without touching the populated bitmap. + */ + while (!list_empty(&pcpu_chunk_lists[pcpu_to_depopulate_slot])) { + chunk = list_first_entry(&pcpu_chunk_lists[pcpu_to_depopulate_slot], + struct pcpu_chunk, list); + WARN_ON(chunk->immutable); + + /* + * Scan chunk's pages in the reverse order to keep populated + * pages close to the beginning of the chunk. + */ + freed_page_start = chunk->nr_pages; + freed_page_end = 0; /* start > end: empty range until something is depopulated */ + reintegrate = false; + for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) { + /* no more work to do */ + if (chunk->nr_empty_pop_pages == 0) + break; + + /* reintegrate chunk to prevent atomic alloc failures */ + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) { + reintegrate = true; + goto end_chunk; + } + + /* + * If the page is empty and populated, start or + * extend the (i, end) range. If i == 0, decrease + * i and perform the depopulation to cover the last + * (first) page in the chunk. 
+ */ + block = chunk->md_blocks + i; + if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS && + test_bit(i, chunk->populated)) { + if (end == -1) + end = i; + if (i > 0) + continue; + i--; + } + + /* depopulate if there is an active range */ + if (end == -1) + continue; + + spin_unlock_irq(&pcpu_lock); + pcpu_depopulate_chunk(chunk, i + 1, end + 1); + cond_resched(); + spin_lock_irq(&pcpu_lock); + + pcpu_chunk_depopulated(chunk, i + 1, end + 1); + freed_page_start = min(freed_page_start, i + 1); + freed_page_end = max(freed_page_end, end + 1); + + /* reset the range and continue */ + end = -1; + } + +end_chunk: + /* batch tlb flush per chunk to amortize cost */ + if (freed_page_start < freed_page_end) { + spin_unlock_irq(&pcpu_lock); + pcpu_post_unmap_tlb_flush(chunk, + freed_page_start, + freed_page_end); + cond_resched(); + spin_lock_irq(&pcpu_lock); + } + + if (reintegrate || chunk->free_bytes == pcpu_unit_size) + pcpu_reintegrate_chunk(chunk); + else + list_move_tail(&chunk->list, + &pcpu_chunk_lists[pcpu_sidelined_slot]); + } +} + +/** + * pcpu_balance_workfn - manage the amount of free chunks and populated pages + * @work: unused + * + * For each chunk type, manage the number of fully free chunks and the number of + * populated pages. An important thing to consider is when pages are freed and + * how they contribute to the global counts. + */ +static void pcpu_balance_workfn(struct work_struct *work) +{ + /* + * pcpu_balance_free() is called twice because the first time we may + * trim pages in the active pcpu_nr_empty_pop_pages which may cause us + * to grow other chunks. This then gives pcpu_reclaim_populated() time + * to move fully free chunks to the active list to be freed if + * appropriate. 
+ */ + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irq(&pcpu_lock); + + pcpu_balance_free(false); /* free every fully free chunk but the first */ + pcpu_reclaim_populated(); + pcpu_balance_populated(); + pcpu_balance_free(true); /* second pass: only chunks without empty populated pages */ + + spin_unlock_irq(&pcpu_lock); + mutex_unlock(&pcpu_alloc_mutex); +} + +/** + * free_percpu - free percpu area + * @ptr: pointer to area to free, NULL is ignored + * + * Free percpu area @ptr. + * + * CONTEXT: + * Can be called from atomic context. + */ +void free_percpu(void __percpu *ptr) +{ + void *addr; + struct pcpu_chunk *chunk; + unsigned long flags; + int size, off; + bool need_balance = false; + + if (!ptr) + return; + + kmemleak_free_percpu(ptr); + + addr = __pcpu_ptr_to_addr(ptr); + + spin_lock_irqsave(&pcpu_lock, flags); + + chunk = pcpu_chunk_addr_search(addr); + off = addr - chunk->base_addr; + + size = pcpu_free_area(chunk, off); + + pcpu_memcg_free_hook(chunk, off, size); + + /* + * If there are more than one fully free chunks, wake up grim reaper. + * If the chunk is isolated, it may be in the process of being + * reclaimed. Let reclaim manage cleaning up of that chunk. 
+ */ + if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) { + struct pcpu_chunk *pos; + + list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list) + if (pos != chunk) { /* some other chunk sits in the free slot too */ + need_balance = true; + break; + } + } else if (pcpu_should_reclaim_chunk(chunk)) { + pcpu_isolate_chunk(chunk); + need_balance = true; + } + + trace_percpu_free_percpu(chunk->base_addr, off, ptr); + + spin_unlock_irqrestore(&pcpu_lock, flags); + + if (need_balance) + pcpu_schedule_balance_work(); +} +EXPORT_SYMBOL_GPL(free_percpu); + +bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr) +{ +#ifdef CONFIG_SMP + const size_t static_size = __per_cpu_end - __per_cpu_start; + void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); + unsigned int cpu; + + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr(base, cpu); + void *va = (void *)addr; + + if (va >= start && va < start + static_size) { + if (can_addr) { /* report the same offset within the boot CPU's area */ + *can_addr = (unsigned long) (va - start); + *can_addr += (unsigned long) + per_cpu_ptr(base, get_boot_cpu_id()); + } + return true; + } + } +#endif + /* on UP, can't distinguish from other static vars, always false */ + return false; +} + +/** + * is_kernel_percpu_address - test whether address is from static percpu area + * @addr: address to test + * + * Test whether @addr belongs to in-kernel static percpu area. Module + * static percpu areas are not considered. For those, use + * is_module_percpu_address(). + * + * RETURNS: + * %true if @addr is from in-kernel static percpu area, %false otherwise. + */ +bool is_kernel_percpu_address(unsigned long addr) +{ + return __is_kernel_percpu_address(addr, NULL); +} + +/** + * per_cpu_ptr_to_phys - convert translated percpu address to physical address + * @addr: the address to be converted to physical address + * + * Given @addr which is dereferenceable address obtained via one of + * percpu access macros, this function translates it into its physical + * address. 
The caller is responsible for ensuring @addr stays valid + * until this function finishes. + * + * percpu allocator has special setup for the first chunk, which currently + * supports either embedding in linear address space or vmalloc mapping, + * and, from the second one, the backing allocator (currently either vm or + * km) provides translation. + * + * The addr can be translated simply without checking if it falls into the + * first chunk. But the current code reflects better how percpu allocator + * actually works, and the verification can discover both bugs in percpu + * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current + * code. + * + * RETURNS: + * The physical address for @addr. + */ +phys_addr_t per_cpu_ptr_to_phys(void *addr) +{ + void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); + bool in_first_chunk = false; + unsigned long first_low, first_high; + unsigned int cpu; + + /* + * The following test on first_low/high isn't strictly + * necessary but will speed up lookups of addresses which + * aren't in the first chunk. + * + * The address check is against full chunk sizes. pcpu_base_addr + * points to the beginning of the first chunk including the + * static region. Assumes good intent as the first chunk may + * not be full (i.e. < pcpu_unit_pages in size). 
+ */ + first_low = (unsigned long)pcpu_base_addr + + pcpu_unit_page_offset(pcpu_low_unit_cpu, 0); + first_high = (unsigned long)pcpu_base_addr + + pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages); + if ((unsigned long)addr >= first_low && + (unsigned long)addr < first_high) { + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr(base, cpu); + + if (addr >= start && addr < start + pcpu_unit_size) { + in_first_chunk = true; + break; + } + } + } + + if (in_first_chunk) { + if (!is_vmalloc_addr(addr)) + return __pa(addr); + else + return page_to_phys(vmalloc_to_page(addr)) + + offset_in_page(addr); + } else + return page_to_phys(pcpu_addr_to_page(addr)) + + offset_in_page(addr); +} + +/** + * pcpu_alloc_alloc_info - allocate percpu allocation info + * @nr_groups: the number of groups + * @nr_units: the number of units + * + * Allocate ai which is large enough for @nr_groups groups containing + * @nr_units units. The returned ai's groups[0].cpu_map points to the + * cpu_map array which is long enough for @nr_units and filled with + * NR_CPUS. It's the caller's responsibility to initialize cpu_map + * pointer of other groups. + * + * RETURNS: + * Pointer to the allocated pcpu_alloc_info on success, NULL on + * failure. 
+ */ +struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units) +{ + struct pcpu_alloc_info *ai; + size_t base_size, ai_size; + void *ptr; + int unit; + + base_size = ALIGN(struct_size(ai, groups, nr_groups), + __alignof__(ai->groups[0].cpu_map[0])); + ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); + + ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE); + if (!ptr) + return NULL; + ai = ptr; + ptr += base_size; /* cpu_map storage follows the aligned header */ + + ai->groups[0].cpu_map = ptr; + + for (unit = 0; unit < nr_units; unit++) + ai->groups[0].cpu_map[unit] = NR_CPUS; + + ai->nr_groups = nr_groups; + ai->__ai_size = PFN_ALIGN(ai_size); /* allocated size, used by pcpu_free_alloc_info() */ + + return ai; +} + +/** + * pcpu_free_alloc_info - free percpu allocation info + * @ai: pcpu_alloc_info to free + * + * Free @ai which was allocated by pcpu_alloc_alloc_info(). + */ +void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) +{ + memblock_free(ai, ai->__ai_size); +} + +/** + * pcpu_dump_alloc_info - print out information about pcpu_alloc_info + * @lvl: loglevel + * @ai: allocation info to dump + * + * Print out information about @ai using loglevel @lvl. 
+ */ +static void pcpu_dump_alloc_info(const char *lvl, + const struct pcpu_alloc_info *ai) +{ + int group_width = 1, cpu_width = 1, width; + char empty_str[] = "--------"; /* printed for unused units, trimmed to cpu_width below */ + int alloc = 0, alloc_end = 0; + int group, v; + int upa, apl; /* units per alloc, allocs per line */ + + v = ai->nr_groups; + while (v /= 10) + group_width++; + + v = num_possible_cpus(); + while (v /= 10) + cpu_width++; + empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; + + upa = ai->alloc_size / ai->unit_size; + width = upa * (cpu_width + 1) + group_width + 3; + apl = rounddown_pow_of_two(max(60 / width, 1)); + + printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", + lvl, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); + + for (group = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + int unit = 0, unit_end = 0; + + BUG_ON(gi->nr_units % upa); + for (alloc_end += gi->nr_units / upa; + alloc < alloc_end; alloc++) { + if (!(alloc % apl)) { + pr_cont("\n"); + printk("%spcpu-alloc: ", lvl); + } + pr_cont("[%0*d] ", group_width, group); + + for (unit_end += upa; unit < unit_end; unit++) + if (gi->cpu_map[unit] != NR_CPUS) + pr_cont("%0*d ", + cpu_width, gi->cpu_map[unit]); + else + pr_cont("%s ", empty_str); + } + } + pr_cont("\n"); +} + +/** + * pcpu_setup_first_chunk - initialize the first percpu chunk + * @ai: pcpu_alloc_info describing how the percpu area is shaped + * @base_addr: mapped address + * + * Initialize the first percpu chunk which contains the kernel static + * percpu area. This function is to be called from arch percpu area + * setup path. + * + * @ai contains all information necessary to initialize the first + * chunk and prime the dynamic percpu allocator. + * + * @ai->static_size is the size of static percpu area. + * + * @ai->reserved_size, if non-zero, specifies the amount of bytes to + * reserve after the static area in the first chunk. 
This reserves + * the first chunk such that it's available only through reserved + * percpu allocation. This is primarily used to serve module percpu + * static areas on architectures where the addressing model has + * limited offset range for symbol relocations to guarantee module + * percpu symbols fall inside the relocatable range. + * + * @ai->dyn_size determines the number of bytes available for dynamic + * allocation in the first chunk. The area between @ai->static_size + + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. + * + * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE + * and equal to or larger than @ai->static_size + @ai->reserved_size + + * @ai->dyn_size. + * + * @ai->atom_size is the allocation atom size and used as alignment + * for vm areas. + * + * @ai->alloc_size is the allocation size and always multiple of + * @ai->atom_size. This is larger than @ai->atom_size if + * @ai->unit_size is larger than @ai->atom_size. + * + * @ai->nr_groups and @ai->groups describe virtual memory layout of + * percpu areas. Units which should be colocated are put into the + * same group. Dynamic VM areas will be allocated according to these + * groupings. @ai->nr_groups must be at least one; this is + * enforced by the sanity checks below. + * + * The caller should have mapped the first chunk at @base_addr and + * copied static data to each unit. + * + * The first chunk will always contain a static and a dynamic region. + * However, the static region is not managed by any chunk. If the first + * chunk also contains a reserved region, it is served by two chunks - + * one for the reserved region and one for the dynamic region. They + * share the same vm, but use offset regions in the area allocation map. + * The chunk serving the dynamic region is circulated in the chunk slots + * and available for dynamic allocation like any other chunk. 
+ */ +void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) +{ + size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + size_t static_size, dyn_size; + struct pcpu_chunk *chunk; + unsigned long *group_offsets; + size_t *group_sizes; + unsigned long *unit_off; + unsigned int cpu; + int *unit_map; + int group, unit, i; + int map_size; + unsigned long tmp_addr; + size_t alloc_size; + +#define PCPU_SETUP_BUG_ON(cond) do { \ + if (unlikely(cond)) { \ + pr_emerg("failed to initialize, %s\n", #cond); \ + pr_emerg("cpu_possible_mask=%*pb\n", \ + cpumask_pr_args(cpu_possible_mask)); \ + pcpu_dump_alloc_info(KERN_EMERG, ai); \ + BUG(); \ + } \ +} while (0) + + /* sanity checks */ + PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); +#ifdef CONFIG_SMP + PCPU_SETUP_BUG_ON(!ai->static_size); + PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start)); +#endif + PCPU_SETUP_BUG_ON(!base_addr); + PCPU_SETUP_BUG_ON(offset_in_page(base_addr)); + PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); + PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); + PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE)); + PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); + PCPU_SETUP_BUG_ON(!ai->dyn_size); + PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE)); + PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) || + IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE))); + PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); + + /* process group information and build config tables accordingly */ + alloc_size = ai->nr_groups * sizeof(group_offsets[0]); + group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!group_offsets) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = ai->nr_groups * sizeof(group_sizes[0]); + group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!group_sizes) + panic("%s: Failed to 
allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = nr_cpu_ids * sizeof(unit_map[0]); + unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!unit_map) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = nr_cpu_ids * sizeof(unit_off[0]); + unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!unit_off) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + unit_map[cpu] = UINT_MAX; /* UINT_MAX marks a CPU with no unit assigned yet */ + + pcpu_low_unit_cpu = NR_CPUS; + pcpu_high_unit_cpu = NR_CPUS; + + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + group_offsets[group] = gi->base_offset; + group_sizes[group] = gi->nr_units * ai->unit_size; + + for (i = 0; i < gi->nr_units; i++) { + cpu = gi->cpu_map[i]; + if (cpu == NR_CPUS) + continue; + + PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids); + PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); + PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); + + unit_map[cpu] = unit + i; + unit_off[cpu] = gi->base_offset + i * ai->unit_size; + + /* determine low/high unit_cpu */ + if (pcpu_low_unit_cpu == NR_CPUS || + unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) + pcpu_low_unit_cpu = cpu; + if (pcpu_high_unit_cpu == NR_CPUS || + unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) + pcpu_high_unit_cpu = cpu; + } + } + pcpu_nr_units = unit; + + for_each_possible_cpu(cpu) + PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); + + /* we're done parsing the input, undefine BUG macro and dump config */ +#undef PCPU_SETUP_BUG_ON + pcpu_dump_alloc_info(KERN_DEBUG, ai); + + pcpu_nr_groups = ai->nr_groups; + pcpu_group_offsets = group_offsets; + pcpu_group_sizes = group_sizes; + pcpu_unit_map = unit_map; + pcpu_unit_offsets = unit_off; + + /* determine basic parameters */ + pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; + pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; + pcpu_atom_size = ai->atom_size; + 
pcpu_chunk_struct_size = struct_size(chunk, populated, + BITS_TO_LONGS(pcpu_unit_pages)); + + pcpu_stats_save_ai(ai); + + /* + * Allocate chunk slots. The slots after the active slots are: + * sidelined_slot - isolated, depopulated chunks + * free_slot - fully free chunks + * to_depopulate_slot - isolated, chunks to depopulate + */ + pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1; + pcpu_free_slot = pcpu_sidelined_slot + 1; + pcpu_to_depopulate_slot = pcpu_free_slot + 1; + pcpu_nr_slots = pcpu_to_depopulate_slot + 1; + pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots * + sizeof(pcpu_chunk_lists[0]), + SMP_CACHE_BYTES); + if (!pcpu_chunk_lists) + panic("%s: Failed to allocate %zu bytes\n", __func__, + pcpu_nr_slots * sizeof(pcpu_chunk_lists[0])); + + for (i = 0; i < pcpu_nr_slots; i++) + INIT_LIST_HEAD(&pcpu_chunk_lists[i]); + + /* + * The end of the static region needs to be aligned with the + * minimum allocation size as this offsets the reserved and + * dynamic region. The first chunk ends page aligned by + * expanding the dynamic region, therefore the dynamic region + * can be shrunk to compensate while still staying above the + * configured sizes. + */ + static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE); + dyn_size = ai->dyn_size - (static_size - ai->static_size); + + /* + * Initialize first chunk. + * If the reserved_size is non-zero, this initializes the reserved + * chunk. If the reserved_size is zero, the reserved chunk is NULL + * and the dynamic region is initialized here. The first chunk, + * pcpu_first_chunk, will always point to the chunk that serves + * the dynamic region. 
+ */ + tmp_addr = (unsigned long)base_addr + static_size; + map_size = ai->reserved_size ?: dyn_size; + chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); + + /* init dynamic chunk if necessary */ + if (ai->reserved_size) { + pcpu_reserved_chunk = chunk; + + tmp_addr = (unsigned long)base_addr + static_size + + ai->reserved_size; + map_size = dyn_size; + chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); + } + + /* link the first chunk in */ + pcpu_first_chunk = chunk; + pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; + pcpu_chunk_relocate(pcpu_first_chunk, -1); + + /* include all regions of the first chunk */ + pcpu_nr_populated += PFN_DOWN(size_sum); + + pcpu_stats_chunk_alloc(); + trace_percpu_create_chunk(base_addr); + + /* we're done */ + pcpu_base_addr = base_addr; +} + +#ifdef CONFIG_SMP + +const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = { + [PCPU_FC_AUTO] = "auto", + [PCPU_FC_EMBED] = "embed", + [PCPU_FC_PAGE] = "page", +}; + +enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; + +static int __init percpu_alloc_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (0) + /* nada: anchors the else-if chain under the #ifdefs below */; +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK + else if (!strcmp(str, "embed")) + pcpu_chosen_fc = PCPU_FC_EMBED; +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + else if (!strcmp(str, "page")) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + else + pr_warn("unknown allocator %s specified\n", str); + + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); + +/* + * pcpu_embed_first_chunk() is used by the generic percpu setup. + * Build it if needed by the arch config or the generic setup is going + * to be used. 
+ */ +#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ + !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) +#define BUILD_EMBED_FIRST_CHUNK +#endif + +/* build pcpu_page_first_chunk() iff needed by the arch config */ +#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) +#define BUILD_PAGE_FIRST_CHUNK +#endif + +/* pcpu_build_alloc_info() is used by both embed and page first chunk */ +#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK) +/** + * pcpu_build_alloc_info - build alloc_info considering distances between CPUs + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: minimum free size for dynamic allocation in bytes + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * + * This function determines grouping of units, their mappings to cpus + * and other parameters considering needed percpu size, allocation + * atom size and distances between CPUs. + * + * Groups are always multiples of atom size and CPUs which are of + * LOCAL_DISTANCE both ways are grouped together and share space for + * units in the same group. The returned configuration is guaranteed + * to have CPUs on different nodes on different groups and >=75% usage + * of allocated virtual address space. + * + * RETURNS: + * On success, pointer to the new allocation_info is returned. On + * failure, ERR_PTR value is returned. 
+ */ +static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( + size_t reserved_size, size_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + static struct cpumask mask __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; + int nr_groups = 1, nr_units = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, best_upa; /* units_per_alloc */ + int last_allocs, group, unit; + unsigned int cpu, tcpu; + struct pcpu_alloc_info *ai; + unsigned int *cpu_map; + + /* this function may be called multiple times */ + memset(group_map, 0, sizeof(group_map)); + memset(group_cnt, 0, sizeof(group_cnt)); + cpumask_clear(&mask); + + /* calculate size_sum and ensure dyn_size is enough for early alloc */ + size_sum = PFN_ALIGN(static_size + reserved_size + + max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); + dyn_size = size_sum - static_size - reserved_size; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of atom_size and is the smallest + * which can accommodate 4k aligned segments which are equal to + * or larger than min_unit_size. 
+ */ + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + /* determine the maximum # of units that can fit in an allocation */ + alloc_size = roundup(min_unit_size, atom_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || (offset_in_page(alloc_size / upa))) + upa--; + max_upa = upa; + + cpumask_copy(&mask, cpu_possible_mask); + + /* group cpus according to their proximity */ + for (group = 0; !cpumask_empty(&mask); group++) { + /* pop the group's first cpu */ + cpu = cpumask_first(&mask); + group_map[cpu] = group; + group_cnt[group]++; + cpumask_clear_cpu(cpu, &mask); + + for_each_cpu(tcpu, &mask) { + if (!cpu_distance_fn || + (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE && + cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) { + group_map[tcpu] = group; + group_cnt[group]++; + cpumask_clear_cpu(tcpu, &mask); + } + } + } + nr_groups = group; + + /* + * Wasted space is caused by a ratio imbalance of upa to group_cnt. + * Expand the unit_size until we use >= 75% of the units allocated. + * Related to atom_size, which could be much larger than the unit_size. + */ + last_allocs = INT_MAX; + best_upa = 0; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || (offset_in_page(alloc_size / upa))) + continue; + + for (group = 0; group < nr_groups; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 1/3. The + * greater-than comparison ensures upa==1 always + * passes the following check. 
+ */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + BUG_ON(!best_upa); + upa = best_upa; + + /* allocate and fill alloc_info */ + for (group = 0; group < nr_groups; group++) + nr_units += roundup(group_cnt[group], upa); + + ai = pcpu_alloc_alloc_info(nr_groups, nr_units); + if (!ai) + return ERR_PTR(-ENOMEM); + cpu_map = ai->groups[0].cpu_map; + + for (group = 0; group < nr_groups; group++) { + ai->groups[group].cpu_map = cpu_map; + cpu_map += roundup(group_cnt[group], upa); + } + + ai->static_size = static_size; + ai->reserved_size = reserved_size; + ai->dyn_size = dyn_size; + ai->unit_size = alloc_size / upa; + ai->atom_size = atom_size; + ai->alloc_size = alloc_size; + + for (group = 0, unit = 0; group < nr_groups; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + + /* + * Initialize base_offset as if all groups are located + * back-to-back. The caller should update this to + * reflect actual allocation. 
+ */ + gi->base_offset = unit * ai->unit_size; + + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + gi->cpu_map[gi->nr_units++] = cpu; + gi->nr_units = roundup(gi->nr_units, upa); + unit += gi->nr_units; + } + BUG_ON(unit != nr_units); + + return ai; +} + +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) +{ + const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NUMA + int node = NUMA_NO_NODE; + void *ptr; + + if (cpu_to_nd_fn) + node = cpu_to_nd_fn(cpu); + + if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) { + ptr = memblock_alloc_from(size, align, goal); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n", + cpu, size, (u64)__pa(ptr)); + } else { + ptr = memblock_alloc_try_nid(size, align, goal, + MEMBLOCK_ALLOC_ACCESSIBLE, + node); + + pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n", + cpu, size, node, (u64)__pa(ptr)); + } + return ptr; +#else + return memblock_alloc_from(size, align, goal); +#endif +} + +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + memblock_free(ptr, size); +} +#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */ + +#if defined(BUILD_EMBED_FIRST_CHUNK) +/** + * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: minimum free size for dynamic allocation in bytes + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * @cpu_to_nd_fn: callback to convert cpu to its node, optional + * + * This is a helper to ease setting up embedded first percpu chunk and + * can be called where pcpu_setup_first_chunk() is expected. 
+ * + * If this function is used to setup the first chunk, it is allocated + * by calling pcpu_fc_alloc and used as-is without being mapped into + * vmalloc area. Allocations are always whole multiples of @atom_size + * aligned to @atom_size. + * + * This enables the first chunk to piggy back on the linear physical + * mapping which often uses larger page size. Please note that this + * can result in very sparse cpu->unit mapping on NUMA machines thus + * requiring large vmalloc address space. Don't use this allocator if + * vmalloc space is not orders of magnitude larger than distances + * between node memory addresses (ie. 32bit NUMA machines). + * + * @dyn_size specifies the minimum dynamic area size. + * + * If the needed size is smaller than the minimum or specified unit + * size, the leftover is returned using pcpu_fc_free. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) +{ + void *base = (void *)ULONG_MAX; + void **areas = NULL; + struct pcpu_alloc_info *ai; + size_t size_sum, areas_size; + unsigned long max_distance; + int group, i, highest_group, rc = 0; + + ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, + cpu_distance_fn); + if (IS_ERR(ai)) + return PTR_ERR(ai); + + size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); + + areas = memblock_alloc(areas_size, SMP_CACHE_BYTES); + if (!areas) { + rc = -ENOMEM; + goto out_free; + } + + /* allocate, copy and determine base address & max_distance */ + highest_group = 0; + for (group = 0; group < ai->nr_groups; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + unsigned int cpu = NR_CPUS; + void *ptr; + + for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) + cpu = gi->cpu_map[i]; + BUG_ON(cpu == NR_CPUS); + + /* allocate space for 
the whole group */ + ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn); + if (!ptr) { + rc = -ENOMEM; + goto out_free_areas; + } + /* kmemleak tracks the percpu allocations separately */ + kmemleak_ignore_phys(__pa(ptr)); + areas[group] = ptr; + + base = min(ptr, base); + if (ptr > areas[highest_group]) + highest_group = group; + } + max_distance = areas[highest_group] - base; + max_distance += ai->unit_size * ai->groups[highest_group].nr_units; + + /* warn if maximum distance is further than 75% of vmalloc space */ + if (max_distance > VMALLOC_TOTAL * 3 / 4) { + pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n", + max_distance, VMALLOC_TOTAL); +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + /* and fail if we have fallback */ + rc = -EINVAL; + goto out_free_areas; +#endif + } + + /* + * Copy data and free unused parts. This should happen after all + * allocations are complete; otherwise, we may end up with + * overlapping groups. + */ + for (group = 0; group < ai->nr_groups; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + void *ptr = areas[group]; + + for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { + if (gi->cpu_map[i] == NR_CPUS) { + /* unused unit, free whole */ + pcpu_fc_free(ptr, ai->unit_size); + continue; + } + /* copy and return the unused part */ + memcpy(ptr, __per_cpu_load, ai->static_size); + pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); + } + } + + /* base address is now known, determine group base offsets */ + for (group = 0; group < ai->nr_groups; group++) { + ai->groups[group].base_offset = areas[group] - base; + } + + pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n", + PFN_DOWN(size_sum), ai->static_size, ai->reserved_size, + ai->dyn_size, ai->unit_size); + + pcpu_setup_first_chunk(ai, base); + goto out_free; + +out_free_areas: + for (group = 0; group < ai->nr_groups; group++) + if (areas[group]) + pcpu_fc_free(areas[group], + ai->groups[group].nr_units * 
ai->unit_size); +out_free: + pcpu_free_alloc_info(ai); + if (areas) + memblock_free(areas, areas_size); + return rc; +} +#endif /* BUILD_EMBED_FIRST_CHUNK */ + +#ifdef BUILD_PAGE_FIRST_CHUNK +#include <asm/pgalloc.h> + +#ifndef P4D_TABLE_SIZE +#define P4D_TABLE_SIZE PAGE_SIZE +#endif + +#ifndef PUD_TABLE_SIZE +#define PUD_TABLE_SIZE PAGE_SIZE +#endif + +#ifndef PMD_TABLE_SIZE +#define PMD_TABLE_SIZE PAGE_SIZE +#endif + +#ifndef PTE_TABLE_SIZE +#define PTE_TABLE_SIZE PAGE_SIZE +#endif +void __init __weak pcpu_populate_pte(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + if (pgd_none(*pgd)) { + p4d_t *new; + + new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); + if (!new) + goto err_alloc; + pgd_populate(&init_mm, pgd, new); + } + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + pud_t *new; + + new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + if (!new) + goto err_alloc; + p4d_populate(&init_mm, p4d, new); + } + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + pmd_t *new; + + new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + if (!new) + goto err_alloc; + pud_populate(&init_mm, pud, new); + } + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + pte_t *new; + + new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); + if (!new) + goto err_alloc; + pmd_populate_kernel(&init_mm, pmd, new); + } + + return; + +err_alloc: + panic("%s: Failed to allocate memory\n", __func__); +} + +/** + * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages + * @reserved_size: the size of reserved percpu area in bytes + * @cpu_to_nd_fn: callback to convert cpu to its node, optional + * + * This is a helper to ease setting up page-remapped first percpu + * chunk and can be called where pcpu_setup_first_chunk() is expected. + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page into vmalloc area. + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) +{ + static struct vm_struct vm; + struct pcpu_alloc_info *ai; + char psize_str[16]; + int unit_pages; + size_t pages_size; + struct page **pages; + int unit, i, j, rc = 0; + int upa; + int nr_g0_units; + + snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); + + ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + upa = ai->alloc_size/ai->unit_size; + nr_g0_units = roundup(num_possible_cpus(), upa); + if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) { + pcpu_free_alloc_info(ai); + return -EINVAL; + } + + unit_pages = ai->unit_size >> PAGE_SHIFT; + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * + sizeof(pages[0])); + pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); + if (!pages) + panic("%s: Failed to allocate %zu bytes\n", __func__, + pages_size); + + /* allocate pages */ + j = 0; + for (unit = 0; unit < num_possible_cpus(); unit++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; + for (i = 0; i < unit_pages; i++) { + void *ptr; + + ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn); + if (!ptr) { + pr_warn("failed to allocate %s page for cpu%u\n", + psize_str, cpu); + goto enomem; + } + /* kmemleak tracks the percpu allocations separately */ + kmemleak_ignore_phys(__pa(ptr)); + pages[j++] = virt_to_page(ptr); + } + } + + /* allocate vm area, map the pages and copy static data */ + vm.flags = VM_ALLOC; + vm.size = num_possible_cpus() * ai->unit_size; + vm_area_register_early(&vm, PAGE_SIZE); + + for (unit = 0; unit < num_possible_cpus(); unit++) { + unsigned long unit_addr = + (unsigned long)vm.addr + unit * ai->unit_size; + + for (i = 0; i < unit_pages; i++) + pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT)); + + /* pte already populated, the following shouldn't fail */ + 
rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], + unit_pages); + if (rc < 0) + panic("failed to map percpu area, err=%d\n", rc); + + /* + * FIXME: Archs with virtual cache should flush local + * cache for the linear mapping here - something + * equivalent to flush_cache_vmap() on the local cpu. + * flush_cache_vmap() can't be used as most supporting + * data structures are not set up yet. + */ + + /* copy static data */ + memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); + } + + /* we're ready, commit */ + pr_info("%d %s pages/cpu s%zu r%zu d%zu\n", + unit_pages, psize_str, ai->static_size, + ai->reserved_size, ai->dyn_size); + + pcpu_setup_first_chunk(ai, vm.addr); + goto out_free_ar; + +enomem: + while (--j >= 0) + pcpu_fc_free(page_address(pages[j]), PAGE_SIZE); + rc = -ENOMEM; +out_free_ar: + memblock_free(pages, pages_size); + pcpu_free_alloc_info(ai); + return rc; +} +#endif /* BUILD_PAGE_FIRST_CHUNK */ + +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +/* + * Generic SMP percpu area setup. + * + * The embedding helper is used because its behavior closely resembles + * the original non-dynamic generic percpu area setup. This is + * important because many archs have addressing restrictions and might + * fail if the percpu area is located far away from the previous + * location. As an added bonus, in non-NUMA cases, embedding is + * generally a good idea TLB-wise because percpu area can piggy back + * on the physical linear memory mapping which uses large page + * mappings on applicable archs. + */ +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. 
+ */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, + PAGE_SIZE, NULL, NULL); + if (rc < 0) + panic("Failed to initialize percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ + +#else /* CONFIG_SMP */ + +/* + * UP percpu area setup. + * + * UP always uses km-based percpu allocator with identity mapping. + * Static percpu variables are indistinguishable from the usual static + * variables and don't require any special preparation. + */ +void __init setup_per_cpu_areas(void) +{ + const size_t unit_size = + roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE, + PERCPU_DYNAMIC_RESERVE)); + struct pcpu_alloc_info *ai; + void *fc; + + ai = pcpu_alloc_alloc_info(1, 1); + fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!ai || !fc) + panic("Failed to allocate memory for percpu areas."); + /* kmemleak tracks the percpu allocations separately */ + kmemleak_ignore_phys(__pa(fc)); + + ai->dyn_size = unit_size; + ai->unit_size = unit_size; + ai->atom_size = unit_size; + ai->alloc_size = unit_size; + ai->groups[0].nr_units = 1; + ai->groups[0].cpu_map[0] = 0; + + pcpu_setup_first_chunk(ai, fc); + pcpu_free_alloc_info(ai); +} + +#endif /* CONFIG_SMP */ + +/* + * pcpu_nr_pages - calculate total number of populated backing pages + * + * This reflects the number of pages populated to back chunks. Metadata is + * excluded in the number exposed in meminfo as the number of backing pages + * scales with the number of cpus and can quickly outweigh the memory used for + * metadata. It also keeps this calculation nice and simple. + * + * RETURNS: + * Total number of populated backing pages in use by the allocator. 
+ */ +unsigned long pcpu_nr_pages(void) +{ + return pcpu_nr_populated * pcpu_nr_units; +} + +/* + * Percpu allocator is initialized early during boot when neither slab nor + * workqueue is available. Plug async management until everything is up + * and running. + */ +static int __init percpu_enable_async(void) +{ + pcpu_async_enabled = true; + return 0; +} +subsys_initcall(percpu_enable_async); diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h new file mode 100644 index 000000000..e9e879de8 --- /dev/null +++ b/mm/pgalloc-track.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PGALLOC_TRACK_H +#define _LINUX_PGALLOC_TRACK_H + +#if defined(CONFIG_MMU) +static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, + unsigned long address, + pgtbl_mod_mask *mod_mask) +{ + if (unlikely(pgd_none(*pgd))) { + if (__p4d_alloc(mm, pgd, address)) + return NULL; + *mod_mask |= PGTBL_PGD_MODIFIED; + } + + return p4d_offset(pgd, address); +} + +static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d, + unsigned long address, + pgtbl_mod_mask *mod_mask) +{ + if (unlikely(p4d_none(*p4d))) { + if (__pud_alloc(mm, p4d, address)) + return NULL; + *mod_mask |= PGTBL_P4D_MODIFIED; + } + + return pud_offset(p4d, address); +} + +static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud, + unsigned long address, + pgtbl_mod_mask *mod_mask) +{ + if (unlikely(pud_none(*pud))) { + if (__pmd_alloc(mm, pud, address)) + return NULL; + *mod_mask |= PGTBL_PUD_MODIFIED; + } + + return pmd_offset(pud, address); +} +#endif /* CONFIG_MMU */ + +#define pte_alloc_kernel_track(pmd, address, mask) \ + ((unlikely(pmd_none(*(pmd))) && \ + (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\ + NULL: pte_offset_kernel(pmd, address)) + +#endif /* _LINUX_PGALLOC_TRACK_H */ diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c new file mode 100644 index 000000000..90ab721a1 --- /dev/null +++ b/mm/pgtable-generic.c @@ -0,0 +1,231 @@ 
+// SPDX-License-Identifier: GPL-2.0 +/* + * mm/pgtable-generic.c + * + * Generic pgtable methods declared in linux/pgtable.h + * + * Copyright (C) 2010 Linus Torvalds + */ + +#include +#include +#include +#include +#include + +/* + * If a p?d_bad entry is found while walking page tables, report + * the error, before resetting entry to p?d_none. Usually (but + * very seldom) called out from the p?d_none_or_clear_bad macros. + */ + +void pgd_clear_bad(pgd_t *pgd) +{ + pgd_ERROR(*pgd); + pgd_clear(pgd); +} + +#ifndef __PAGETABLE_P4D_FOLDED +void p4d_clear_bad(p4d_t *p4d) +{ + p4d_ERROR(*p4d); + p4d_clear(p4d); +} +#endif + +#ifndef __PAGETABLE_PUD_FOLDED +void pud_clear_bad(pud_t *pud) +{ + pud_ERROR(*pud); + pud_clear(pud); +} +#endif + +/* + * Note that the pmd variant below can't be stub'ed out just as for p4d/pud + * above. pmd folding is special and typically pmd_* macros refer to upper + * level even when folded + */ +void pmd_clear_bad(pmd_t *pmd) +{ + pmd_ERROR(*pmd); + pmd_clear(pmd); +} + +#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +/* + * Only sets the access flags (dirty, accessed), as well as write + * permission. Furthermore, we know it always gets set to a "more + * permissive" setting, which allows most architectures to optimize + * this. We return whether the PTE actually changed, which in turn + * instructs the caller to do things like update__mmu_cache. 
This + * used to be done in the caller, but sparc needs minor faults to + * force that call on sun4c so we changed this macro slightly + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + if (changed) { + set_pte_at(vma->vm_mm, address, ptep, entry); + flush_tlb_fix_spurious_fault(vma, address); + } + return changed; +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + struct mm_struct *mm = (vma)->vm_mm; + pte_t pte; + pte = ptep_get_and_clear(mm, address, ptep); + if (pte_accessible(mm, pte)) + flush_tlb_page(vma, address); + return pte; +} +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + int changed = !pmd_same(*pmdp, entry); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (changed) { + set_pmd_at(vma->vm_mm, address, pmdp, entry); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + return changed; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH +pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + 
pmd_t pmd; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && + !pmd_devmap(*pmdp)); + pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} + +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + pud_t pud; + + VM_BUG_ON(address & ~HPAGE_PUD_MASK); + VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp)); + pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp); + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return pud; +} +#endif +#endif + +#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(&pgtable->lru); + else + list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); + pmd_huge_pte(mm, pmdp) = pgtable; +} +#endif + +#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW +/* no "address" argument so destroys page coloring of some arch */ +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pgtable_t pgtable; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru, + struct page, lru); + if (pmd_huge_pte(mm, pmdp)) + list_del(&pgtable->lru); + return pgtable; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_INVALIDATE +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return old; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD +pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + return pmdp_invalidate(vma, 
address, pmdp); +} +#endif + +#ifndef pmdp_collapse_flush +pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + /* + * pmd and hugepage pte format are same. So we could + * use the same function. + */ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); + + /* collapse entails shooting down ptes not pmd */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} +#endif +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c new file mode 100644 index 000000000..78dfaf9e8 --- /dev/null +++ b/mm/process_vm_access.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * linux/mm/process_vm_access.c + * + * Copyright (C) 2010-2011 Christopher Yeoh , IBM Corp. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * process_vm_rw_pages - read/write pages from task specified + * @pages: array of pointers to pages we want to copy + * @offset: offset in page to start copying from/to + * @len: number of bytes to copy + * @iter: where to copy to/from locally + * @vm_write: 0 means copy from, 1 means copy to + * Returns 0 on success, error code otherwise + */ +static int process_vm_rw_pages(struct page **pages, + unsigned offset, + size_t len, + struct iov_iter *iter, + int vm_write) +{ + /* Do the copy for each page */ + while (len && iov_iter_count(iter)) { + struct page *page = *pages++; + size_t copy = PAGE_SIZE - offset; + size_t copied; + + if (copy > len) + copy = len; + + if (vm_write) + copied = copy_page_from_iter(page, offset, copy, iter); + else + copied = copy_page_to_iter(page, offset, copy, iter); + + len -= copied; + if (copied < copy && iov_iter_count(iter)) + return -EFAULT; + offset = 0; + } + return 0; +} + +/* Maximum number of pages kmalloc'd to hold struct page's during copy */ 
+#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2) + +/** + * process_vm_rw_single_vec - read/write pages from task specified + * @addr: start memory address of target process + * @len: size of area to copy to/from + * @iter: where to copy to/from locally + * @process_pages: struct pages area that can store at least + * nr_pages_to_copy struct page pointers + * @mm: mm for task + * @task: task to read/write from + * @vm_write: 0 means copy from, 1 means copy to + * Returns 0 on success, error code otherwise (-EFAULT if no page can be pinned) + */ +static int process_vm_rw_single_vec(unsigned long addr, + unsigned long len, + struct iov_iter *iter, + struct page **process_pages, + struct mm_struct *mm, + struct task_struct *task, + int vm_write) +{ + unsigned long pa = addr & PAGE_MASK; + unsigned long start_offset = addr - pa; + unsigned long nr_pages; + ssize_t rc = 0; + unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES + / sizeof(struct pages *); + unsigned int flags = 0; + + /* Work out address and page range required */ + if (len == 0) + return 0; + nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; + + if (vm_write) + flags |= FOLL_WRITE; + + while (!rc && nr_pages && iov_iter_count(iter)) { + int pinned_pages = min(nr_pages, max_pages_per_loop); + int locked = 1; + size_t bytes; + + /* + * Get the pages we're interested in. 
We must + * access remotely because task/mm might not be + * current/current->mm + */ + mmap_read_lock(mm); + pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages, + flags, process_pages, + NULL, &locked); + if (locked) + mmap_read_unlock(mm); + if (pinned_pages <= 0) + return -EFAULT; + + bytes = pinned_pages * PAGE_SIZE - start_offset; + if (bytes > len) + bytes = len; + + rc = process_vm_rw_pages(process_pages, + start_offset, bytes, iter, + vm_write); + len -= bytes; + start_offset = 0; + nr_pages -= pinned_pages; + pa += pinned_pages * PAGE_SIZE; + + /* If vm_write is set, the pages need to be made dirty: */ + unpin_user_pages_dirty_lock(process_pages, pinned_pages, + vm_write); + } + + return rc; +} + +/* Maximum number of entries for process pages array + which lives on stack */ +#define PVM_MAX_PP_ARRAY_COUNT 16 + +/** + * process_vm_rw_core - core of reading/writing pages from task specified + * @pid: PID of process to read/write from/to + * @iter: where to copy to/from locally + * @rvec: iovec array specifying where to copy to/from in the other process + * @riovcnt: size of rvec array + * @flags: currently unused + * @vm_write: 0 if reading from other process, 1 if writing to other process + * + * Returns the number of bytes read/written or error code. May + * return fewer bytes than expected if an error occurs during the copying + * process. 
+ */ +static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, + const struct iovec *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct task_struct *task; + struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT]; + struct page **process_pages = pp_stack; + struct mm_struct *mm; + unsigned long i; + ssize_t rc = 0; + unsigned long nr_pages = 0; + unsigned long nr_pages_iov; + ssize_t iov_len; + size_t total_len = iov_iter_count(iter); + + /* + * Work out how many struct page pointers we're going to need + * when eventually calling pin_user_pages_remote() + */ + for (i = 0; i < riovcnt; i++) { + iov_len = rvec[i].iov_len; + if (iov_len > 0) { + nr_pages_iov = ((unsigned long)rvec[i].iov_base + + iov_len) + / PAGE_SIZE - (unsigned long)rvec[i].iov_base + / PAGE_SIZE + 1; + nr_pages = max(nr_pages, nr_pages_iov); + } + } + + if (nr_pages == 0) + return 0; + + if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) { + /* For reliability don't try to kmalloc more than + 2 pages worth */ + process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES, + sizeof(struct pages *)*nr_pages), + GFP_KERNEL); + + if (!process_pages) + return -ENOMEM; + } + + /* Get process information */ + task = find_get_task_by_vpid(pid); + if (!task) { + rc = -ESRCH; + goto free_proc_pages; + } + + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); + if (!mm || IS_ERR(mm)) { + rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + /* + * Explicitly map EACCES to EPERM as EPERM is a more + * appropriate error code for process_vm_readv/writev + */ + if (rc == -EACCES) + rc = -EPERM; + goto put_task_struct; + } + + for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++) + rc = process_vm_rw_single_vec( + (unsigned long)rvec[i].iov_base, rvec[i].iov_len, + iter, process_pages, mm, task, vm_write); + + /* copied = space before - space after */ + total_len -= iov_iter_count(iter); + + /* If we have managed to copy any data at all then + we return the number of bytes copied. 
Otherwise + we return the error code */ + if (total_len) + rc = total_len; + + mmput(mm); + +put_task_struct: + put_task_struct(task); + +free_proc_pages: + if (process_pages != pp_stack) + kfree(process_pages); + return rc; +} + +/** + * process_vm_rw - check iovecs before calling core routine + * @pid: PID of process to read/write from/to + * @lvec: iovec array specifying where to copy to/from locally + * @liovcnt: size of lvec array + * @rvec: iovec array specifying where to copy to/from in the other process + * @riovcnt: size of rvec array + * @flags: currently unused, must be 0 (-EINVAL is returned otherwise) + * @vm_write: 0 if reading from other process, 1 if writing to other process + * + * Returns the number of bytes read/written or error code. May + * return fewer bytes than expected if an error occurs during the copying + * process. + */ +static ssize_t process_vm_rw(pid_t pid, + const struct iovec __user *lvec, + unsigned long liovcnt, + const struct iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct iovec iovstack_l[UIO_FASTIOV]; + struct iovec iovstack_r[UIO_FASTIOV]; + struct iovec *iov_l = iovstack_l; + struct iovec *iov_r; + struct iov_iter iter; + ssize_t rc; + int dir = vm_write ? 
ITER_SOURCE : ITER_DEST; + + if (flags != 0) + return -EINVAL; + + /* Check iovecs */ + rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter); + if (rc < 0) + return rc; + if (!iov_iter_count(&iter)) + goto free_iov_l; + iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r, + in_compat_syscall()); + if (IS_ERR(iov_r)) { + rc = PTR_ERR(iov_r); + goto free_iov_l; + } + rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); + if (iov_r != iovstack_r) + kfree(iov_r); +free_iov_l: + kfree(iov_l); + return rc; +} + +SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec, + unsigned long, liovcnt, const struct iovec __user *, rvec, + unsigned long, riovcnt, unsigned long, flags) +{ + return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); +} + +SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, + const struct iovec __user *, lvec, + unsigned long, liovcnt, const struct iovec __user *, rvec, + unsigned long, riovcnt, unsigned long, flags) +{ + return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); +} diff --git a/mm/ptdump.c b/mm/ptdump.c new file mode 100644 index 000000000..8adab455a --- /dev/null +++ b/mm/ptdump.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) +/* + * This is an optimization for KASAN=y case. Since all kasan page tables + * eventually point to the kasan_early_shadow_page we could call note_page() + * right away without walking through lower level page tables. This saves + * us dozens of seconds (minutes for 5-level config) while checking for + * W+X mapping or reading kernel_page_tables debugfs file. 
+ */ +static inline int note_kasan_page_table(struct mm_walk *walk, + unsigned long addr) +{ + struct ptdump_state *st = walk->private; + + st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0])); + + walk->action = ACTION_CONTINUE; + + return 0; +} +#endif + +static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ptdump_state *st = walk->private; + pgd_t val = READ_ONCE(*pgd); + +#if CONFIG_PGTABLE_LEVELS > 4 && \ + (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) + if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d))) + return note_kasan_page_table(walk, addr); +#endif + + if (st->effective_prot) + st->effective_prot(st, 0, pgd_val(val)); + + if (pgd_leaf(val)) { + st->note_page(st, addr, 0, pgd_val(val)); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ptdump_state *st = walk->private; + p4d_t val = READ_ONCE(*p4d); + +#if CONFIG_PGTABLE_LEVELS > 3 && \ + (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) + if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud))) + return note_kasan_page_table(walk, addr); +#endif + + if (st->effective_prot) + st->effective_prot(st, 1, p4d_val(val)); + + if (p4d_leaf(val)) { + st->note_page(st, addr, 1, p4d_val(val)); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int ptdump_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ptdump_state *st = walk->private; + pud_t val = READ_ONCE(*pud); + +#if CONFIG_PGTABLE_LEVELS > 2 && \ + (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) + if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd))) + return note_kasan_page_table(walk, addr); +#endif + + if (st->effective_prot) + st->effective_prot(st, 2, pud_val(val)); + + if (pud_leaf(val)) { + 
st->note_page(st, addr, 2, pud_val(val)); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ptdump_state *st = walk->private; + pmd_t val = READ_ONCE(*pmd); + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte))) + return note_kasan_page_table(walk, addr); +#endif + + if (st->effective_prot) + st->effective_prot(st, 3, pmd_val(val)); + if (pmd_leaf(val)) { + st->note_page(st, addr, 3, pmd_val(val)); + walk->action = ACTION_CONTINUE; + } + + return 0; +} + +static int ptdump_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct ptdump_state *st = walk->private; + pte_t val = ptep_get(pte); + + if (st->effective_prot) + st->effective_prot(st, 4, pte_val(val)); + + st->note_page(st, addr, 4, pte_val(val)); + + return 0; +} + +static int ptdump_hole(unsigned long addr, unsigned long next, + int depth, struct mm_walk *walk) +{ + struct ptdump_state *st = walk->private; + + st->note_page(st, addr, depth, 0); + + return 0; +} + +static const struct mm_walk_ops ptdump_ops = { + .pgd_entry = ptdump_pgd_entry, + .p4d_entry = ptdump_p4d_entry, + .pud_entry = ptdump_pud_entry, + .pmd_entry = ptdump_pmd_entry, + .pte_entry = ptdump_pte_entry, + .pte_hole = ptdump_hole, +}; + +void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) +{ + const struct ptdump_range *range = st->range; + + mmap_write_lock(mm); + while (range->start != range->end) { + walk_page_range_novma(mm, range->start, range->end, + &ptdump_ops, pgd, st); + range++; + } + mmap_write_unlock(mm); + + /* Flush out the last page */ + st->note_page(st, 0, -1, 0); +} diff --git a/mm/readahead.c b/mm/readahead.c new file mode 100644 index 000000000..ba4342804 --- /dev/null +++ b/mm/readahead.c @@ -0,0 +1,851 @@ +// SPDX-License-Identifier: 
GPL-2.0-only +/* + * mm/readahead.c - address_space-level file readahead. + * + * Copyright (C) 2002, Linus Torvalds + * + * 09Apr2002 Andrew Morton + * Initial version. + */ + +/** + * DOC: Readahead Overview + * + * Readahead is used to read content into the page cache before it is + * explicitly requested by the application. Readahead only ever + * attempts to read folios that are not yet in the page cache. If a + * folio is present but not up-to-date, readahead will not try to read + * it. In that case a simple ->read_folio() will be requested. + * + * Readahead is triggered when an application read request (whether a + * system call or a page fault) finds that the requested folio is not in + * the page cache, or that it is in the page cache and has the + * readahead flag set. This flag indicates that the folio was read + * as part of a previous readahead request and now that it has been + * accessed, it is time for the next readahead. + * + * Each readahead request is partly a synchronous read, and partly async + * readahead. This is reflected in the struct file_ra_state which + * contains ->size being the total number of pages, and ->async_size + * which is the number of pages in the async section. The readahead + * flag will be set on the first folio in this async section to trigger + * a subsequent readahead. Once a series of sequential reads has been + * established, there should be no need for a synchronous component and + * all readahead requests will be fully asynchronous. + * + * When either of the triggers causes a readahead, three numbers need + * to be determined: the start of the region to read, the size of the + * region, and the size of the async tail. + * + * The start of the region is simply the first page address at or after + * the accessed address, which is not currently populated in the page + * cache. This is found with a simple search in the page cache. 
+ * + * The size of the async tail is determined by subtracting the size that + * was explicitly requested from the determined request size, unless + * this would be less than zero - then zero is used. NOTE THIS + * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED + * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY. + * + * The size of the region is normally determined from the size of the + * previous readahead which loaded the preceding pages. This may be + * discovered from the struct file_ra_state for simple sequential reads, + * or from examining the state of the page cache when multiple + * sequential reads are interleaved. Specifically: where the readahead + * was triggered by the readahead flag, the size of the previous + * readahead is assumed to be the number of pages from the triggering + * page to the start of the new readahead. In these cases, the size of + * the previous readahead is scaled, often doubled, for the new + * readahead, though see get_next_ra_size() for details. + * + * If the size of the previous read cannot be determined, the number of + * preceding pages in the page cache is used to estimate the size of + * a previous read. This estimate could easily be misled by random + * reads being coincidentally adjacent, so it is ignored unless it is + * larger than the current request, and it is not scaled up, unless it + * is at the start of file. + * + * In general readahead is accelerated at the start of the file, as + * reads from there are often sequential. There are other minor + * adjustments to the readahead size in various special cases and these + * are best discovered by reading the code. + * + * The above calculation, based on the previous readahead size, + * determines the size of the readahead, to which any requested read + * size may be added. 
+ * + * Readahead requests are sent to the filesystem using the ->readahead() + * address space operation, for which mpage_readahead() is a canonical + * implementation. ->readahead() should normally initiate reads on all + * folios, but may fail to read any or all folios without causing an I/O + * error. The page cache reading code will issue a ->read_folio() request + * for any folio which ->readahead() did not read, and only an error + * from this will be final. + * + * ->readahead() will generally call readahead_folio() repeatedly to get + * each folio from those prepared for readahead. It may fail to read a + * folio by: + * + * * not calling readahead_folio() sufficiently many times, effectively + * ignoring some folios, as might be appropriate if the path to + * storage is congested. + * + * * failing to actually submit a read request for a given folio, + * possibly due to insufficient resources, or + * + * * getting an error during subsequent processing of a request. + * + * In the last two cases, the folio should be unlocked by the filesystem + * to indicate that the read attempt has failed. In the first case the + * folio will be unlocked by the VFS. + * + * Those folios not in the final ``async_size`` of the request should be + * considered to be important and ->readahead() should not fail them due + * to congestion or temporary resource unavailability, but should wait + * for necessary resources (e.g. memory or indexing information) to + * become available. Folios in the final ``async_size`` may be + * considered less urgent and failure to read them is more acceptable. + * In this case it is best to use filemap_remove_folio() to remove the + * folios from the page cache as is automatically done for folios that + * were not fetched with readahead_folio(). This will allow a + * subsequent synchronous readahead request to try them again. 
If they + * are left in the page cache, then they will be read individually using + * ->read_folio() which may be less efficient. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/* + * Initialise a struct file's readahead state. Assumes that the caller has + * memset *ra to zero. + */ +void +file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) +{ + ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; + ra->prev_pos = -1; +} +EXPORT_SYMBOL_GPL(file_ra_state_init); + +static void read_pages(struct readahead_control *rac) +{ + const struct address_space_operations *aops = rac->mapping->a_ops; + struct folio *folio; + struct blk_plug plug; + + if (!readahead_count(rac)) + return; + + if (unlikely(rac->_workingset)) + psi_memstall_enter(&rac->_pflags); + blk_start_plug(&plug); + + if (aops->readahead) { + aops->readahead(rac); + /* + * Clean up the remaining folios. The sizes in ->ra + * may be used to size the next readahead, so make sure + * they accurately reflect what happened. + */ + while ((folio = readahead_folio(rac)) != NULL) { + unsigned long nr = folio_nr_pages(folio); + + folio_get(folio); + rac->ra->size -= nr; + if (rac->ra->async_size >= nr) { + rac->ra->async_size -= nr; + filemap_remove_folio(folio); + } + folio_unlock(folio); + folio_put(folio); + } + } else { + while ((folio = readahead_folio(rac)) != NULL) + aops->read_folio(rac->file, folio); + } + + blk_finish_plug(&plug); + if (unlikely(rac->_workingset)) + psi_memstall_leave(&rac->_pflags); + rac->_workingset = false; + + BUG_ON(readahead_count(rac)); +} + +/** + * page_cache_ra_unbounded - Start unchecked readahead. + * @ractl: Readahead control. + * @nr_to_read: The number of pages to read. + * @lookahead_size: Where to start the next readahead. 
+ * + * This function is for filesystems to call when they want to start + * readahead beyond a file's stated i_size. This is almost certainly + * not the function you want to call. Use page_cache_async_readahead() + * or page_cache_sync_readahead() instead. + * + * Context: File is referenced by caller. Mutexes may be held by caller. + * May sleep, but will not reenter filesystem to reclaim memory. + */ +void page_cache_ra_unbounded(struct readahead_control *ractl, + unsigned long nr_to_read, unsigned long lookahead_size) +{ + struct address_space *mapping = ractl->mapping; + unsigned long index = readahead_index(ractl); + gfp_t gfp_mask = readahead_gfp_mask(mapping); + unsigned long i; + + /* + * Partway through the readahead operation, we will have added + * locked pages to the page cache, but will not yet have submitted + * them for I/O. Adding another page may need to allocate memory, + * which can trigger memory reclaim. Telling the VM we're in + * the middle of a filesystem operation will cause it to not + * touch file-backed pages, preventing a deadlock. Most (all?) + * filesystems already specify __GFP_NOFS in their mapping's + * gfp_mask, but let's be explicit here. + */ + unsigned int nofs = memalloc_nofs_save(); + + filemap_invalidate_lock_shared(mapping); + /* + * Preallocate as many pages as we will need. + */ + for (i = 0; i < nr_to_read; i++) { + struct folio *folio = xa_load(&mapping->i_pages, index + i); + + if (folio && !xa_is_value(folio)) { + /* + * Page already present? Kick off the current batch + * of contiguous pages before continuing with the + * next batch. This page may be the one we would + * have intended to mark as Readahead, but we don't + * have a stable reference to this page, and it's + * not worth getting one just for that. 
+ */ + read_pages(ractl); + ractl->_index++; + i = ractl->_index + ractl->_nr_pages - index - 1; + continue; + } + + folio = filemap_alloc_folio(gfp_mask, 0); + if (!folio) + break; + if (filemap_add_folio(mapping, folio, index + i, + gfp_mask) < 0) { + folio_put(folio); + read_pages(ractl); + ractl->_index++; + i = ractl->_index + ractl->_nr_pages - index - 1; + continue; + } + if (i == nr_to_read - lookahead_size) + folio_set_readahead(folio); + ractl->_workingset |= folio_test_workingset(folio); + ractl->_nr_pages++; + } + + /* + * Now start the IO. We ignore I/O errors - if the folio is not + * uptodate then the caller will launch read_folio again, and + * will then handle the error. + */ + read_pages(ractl); + filemap_invalidate_unlock_shared(mapping); + memalloc_nofs_restore(nofs); +} +EXPORT_SYMBOL_GPL(page_cache_ra_unbounded); + +/* + * do_page_cache_ra() actually reads a chunk of disk. It allocates + * the pages first, then submits them for I/O. This avoids the very bad + * behaviour which would occur if page allocations are causing VM writeback. + * We really don't want to intermingle reads and writes like that. + */ +static void do_page_cache_ra(struct readahead_control *ractl, + unsigned long nr_to_read, unsigned long lookahead_size) +{ + struct inode *inode = ractl->mapping->host; + unsigned long index = readahead_index(ractl); + loff_t isize = i_size_read(inode); + pgoff_t end_index; /* The last page we want to read */ + + if (isize == 0) + return; + + end_index = (isize - 1) >> PAGE_SHIFT; + if (index > end_index) + return; + /* Don't read past the page containing the last byte of the file */ + if (nr_to_read > end_index - index) + nr_to_read = end_index - index + 1; + + page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size); +} + +/* + * Chunk the readahead into 2 megabyte units, so that we don't pin too much + * memory at once. 
+ */ +void force_page_cache_ra(struct readahead_control *ractl, + unsigned long nr_to_read) +{ + struct address_space *mapping = ractl->mapping; + struct file_ra_state *ra = ractl->ra; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + unsigned long max_pages, index; + + if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead)) + return; + + /* + * If the request exceeds the readahead window, allow the read to + * be up to the optimal hardware IO size + */ + index = readahead_index(ractl); + max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); + nr_to_read = min_t(unsigned long, nr_to_read, max_pages); + while (nr_to_read) { + unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; + + if (this_chunk > nr_to_read) + this_chunk = nr_to_read; + ractl->_index = index; + do_page_cache_ra(ractl, this_chunk, 0); + + index += this_chunk; + nr_to_read -= this_chunk; + } +} + +/* + * Set the initial window size: round the request up to a power of 2, then + * x 4 if that is <= max/32, x 2 if it is <= max/4, otherwise use max. + * E.g. for 128k (32 page) max ra: + * 1-2 page = 16k, 3-4 page = 32k, 5-8 page = 64k, > 8 page = 128k initial + */ +static unsigned long get_init_ra_size(unsigned long size, unsigned long max) +{ + unsigned long newsize = roundup_pow_of_two(size); + + if (newsize <= max / 32) + newsize = newsize * 4; + else if (newsize <= max / 4) + newsize = newsize * 2; + else + newsize = max; + + return newsize; +} + +/* + * Get the previous window size, ramp it up (x 4 below max/16, x 2 up to + * max/2, otherwise max), and return it as the new window size. + */ +static unsigned long get_next_ra_size(struct file_ra_state *ra, + unsigned long max) +{ + unsigned long cur = ra->size; + + if (cur < max / 16) + return 4 * cur; + if (cur <= max / 2) + return 2 * cur; + return max; +} + +/* + * On-demand readahead design. 
+ * + * The fields in struct file_ra_state represent the most-recently-executed + * readahead attempt: + * + * |<----- async_size ---------| + * |------------------- size -------------------->| + * |==================#===========================| + * ^start ^page marked with PG_readahead + * + * To overlap application thinking time and disk I/O time, we do + * `readahead pipelining': Do not wait until the application consumed all + * readahead pages and stalled on the missing page at readahead_index; + * Instead, submit an asynchronous readahead I/O as soon as there are + * only async_size pages left in the readahead window. Normally async_size + * will be equal to size, for maximum pipelining. + * + * In interleaved sequential reads, concurrent streams on the same fd can + * be invalidating each other's readahead state. So we flag the new readahead + * page at (start+size-async_size) with PG_readahead, and use it as readahead + * indicator. The flag won't be set on already cached pages, to avoid the + * readahead-for-nothing fuss, saving pointless page cache lookups. + * + * prev_pos tracks the last visited byte in the _previous_ read request. + * It should be maintained by the caller, and will be used for detecting + * small random reads. Note that the readahead algorithm checks loosely + * for sequential patterns. Hence interleaved reads might be served as + * sequential ones. + * + * There is a special-case: if the first page which the application tries to + * read happens to be the first page of the file, it is assumed that a linear + * read is about to happen and the window is immediately set to the initial size + * based on I/O request size and the max_readahead. + * + * The code ramps up the readahead size aggressively at first, but slows down as + * it approaches max_readahead. 
+ */ + +/* + * Count contiguously cached pages from @index-1 to @index-@max, + * this count is a conservative estimation of + * - length of the sequential read sequence, or + * - thrashing threshold in memory tight systems + */ +static pgoff_t count_history_pages(struct address_space *mapping, + pgoff_t index, unsigned long max) +{ + pgoff_t head; + + rcu_read_lock(); + head = page_cache_prev_miss(mapping, index - 1, max); + rcu_read_unlock(); + + return index - 1 - head; +} + +/* + * page cache context based readahead + */ +static int try_context_readahead(struct address_space *mapping, + struct file_ra_state *ra, + pgoff_t index, + unsigned long req_size, + unsigned long max) +{ + pgoff_t size; + + size = count_history_pages(mapping, index, max); + + /* + * not enough history pages: + * it could be a random read + */ + if (size <= req_size) + return 0; + + /* + * starts from beginning of file: + * it is a strong indication of long-run stream (or whole-file-read) + */ + if (size >= index) + size *= 2; + + ra->start = index; + ra->size = min(size + req_size, max); + ra->async_size = 1; + + return 1; +} + +/* + * There are some parts of the kernel which assume that PMD entries + * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, + * limit the maximum allocation order to PMD size. 
I'm not aware of any + * assumptions about maximum order if THP are disabled, but 8 seems like + * a good order (that's 1MB if you're using 4kB pages) + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER +#else +#define MAX_PAGECACHE_ORDER 8 +#endif + +static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, + pgoff_t mark, unsigned int order, gfp_t gfp) +{ + int err; + struct folio *folio = filemap_alloc_folio(gfp, order); + + if (!folio) + return -ENOMEM; + mark = round_up(mark, 1UL << order); + if (index == mark) + folio_set_readahead(folio); + err = filemap_add_folio(ractl->mapping, folio, index, gfp); + if (err) { + folio_put(folio); + return err; + } + + ractl->_nr_pages += 1UL << order; + ractl->_workingset |= folio_test_workingset(folio); + return 0; +} + +void page_cache_ra_order(struct readahead_control *ractl, + struct file_ra_state *ra, unsigned int new_order) +{ + struct address_space *mapping = ractl->mapping; + pgoff_t index = readahead_index(ractl); + pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; + pgoff_t mark = index + ra->size - ra->async_size; + int err = 0; + gfp_t gfp = readahead_gfp_mask(mapping); + + if (!mapping_large_folio_support(mapping) || ra->size < 4) + goto fallback; + + limit = min(limit, index + ra->size - 1); + + if (new_order < MAX_PAGECACHE_ORDER) { + new_order += 2; + if (new_order > MAX_PAGECACHE_ORDER) + new_order = MAX_PAGECACHE_ORDER; + while ((1 << new_order) > ra->size) + new_order--; + } + + filemap_invalidate_lock_shared(mapping); + while (index <= limit) { + unsigned int order = new_order; + + /* Align with smaller pages if needed */ + if (index & ((1UL << order) - 1)) { + order = __ffs(index); + if (order == 1) + order = 0; + } + /* Don't allocate pages past EOF */ + while (index + (1UL << order) - 1 > limit) { + if (--order == 1) + order = 0; + } + err = ra_alloc_folio(ractl, index, mark, order, gfp); + if (err) + break; + index += 1UL << 
order; + } + + if (index > limit) { + ra->size += index - limit - 1; + ra->async_size += index - limit - 1; + } + + read_pages(ractl); + filemap_invalidate_unlock_shared(mapping); + + /* + * If there were already pages in the page cache, then we may have + * left some gaps. Let the regular readahead code take care of this + * situation. + */ + if (!err) + return; +fallback: + do_page_cache_ra(ractl, ra->size, ra->async_size); +} + +/* + * A minimal readahead algorithm for trivial sequential/random reads. + */ +static void ondemand_readahead(struct readahead_control *ractl, + struct folio *folio, unsigned long req_size) +{ + struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host); + struct file_ra_state *ra = ractl->ra; + unsigned long max_pages = ra->ra_pages; + unsigned long add_pages; + pgoff_t index = readahead_index(ractl); + pgoff_t expected, prev_index; + unsigned int order = folio ? folio_order(folio) : 0; + + /* + * If the request exceeds the readahead window, allow the read to + * be up to the optimal hardware IO size + */ + if (req_size > max_pages && bdi->io_pages > max_pages) + max_pages = min(req_size, bdi->io_pages); + + /* + * start of file + */ + if (!index) + goto initial_readahead; + + /* + * It's the expected callback index, assume sequential access. + * Ramp up sizes, and push forward the readahead window. + */ + expected = round_up(ra->start + ra->size - ra->async_size, + 1UL << order); + if (index == expected || index == (ra->start + ra->size)) { + ra->start += ra->size; + ra->size = get_next_ra_size(ra, max_pages); + ra->async_size = ra->size; + goto readit; + } + + /* + * Hit a marked folio without valid readahead state. + * E.g. interleaved reads. + * Query the pagecache for async_size, which normally equals to + * readahead size. Ramp it up and use it as the new readahead size. 
+ */ + if (folio) { + pgoff_t start; + + rcu_read_lock(); + start = page_cache_next_miss(ractl->mapping, index + 1, + max_pages); + rcu_read_unlock(); + + if (!start || start - index > max_pages) + return; + + ra->start = start; + ra->size = start - index; /* old async_size */ + ra->size += req_size; + ra->size = get_next_ra_size(ra, max_pages); + ra->async_size = ra->size; + goto readit; + } + + /* + * oversize read + */ + if (req_size > max_pages) + goto initial_readahead; + + /* + * sequential cache miss + * trivial case: (index - prev_index) == 1 + * unaligned reads: (index - prev_index) == 0 + */ + prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; + if (index - prev_index <= 1UL) + goto initial_readahead; + + /* + * Query the page cache and look for the traces(cached history pages) + * that a sequential stream would leave behind. + */ + if (try_context_readahead(ractl->mapping, ra, index, req_size, + max_pages)) + goto readit; + + /* + * standalone, small random read + * Read as is, and do not pollute the readahead state. + */ + do_page_cache_ra(ractl, req_size, 0); + return; + +initial_readahead: + ra->start = index; + ra->size = get_init_ra_size(req_size, max_pages); + ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; + +readit: + /* + * Will this read hit the readahead marker made by itself? + * If so, trigger the readahead marker hit now, and merge + * the resulted next readahead window into the current one. + * Take care of maximum IO pages as above. 
+ */ + if (index == ra->start && ra->size == ra->async_size) { + add_pages = get_next_ra_size(ra, max_pages); + if (ra->size + add_pages <= max_pages) { + ra->async_size = add_pages; + ra->size += add_pages; + } else { + ra->size = max_pages; + ra->async_size = max_pages >> 1; + } + } + + ractl->_index = ra->start; + page_cache_ra_order(ractl, ra, order); +} + +void page_cache_sync_ra(struct readahead_control *ractl, + unsigned long req_count) +{ + bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM); + + /* + * Even if readahead is disabled, issue this request as readahead + * as we'll need it to satisfy the requested range. The forced + * readahead will do the right thing and limit the read to just the + * requested range, which we'll set to 1 page for this case. + */ + if (!ractl->ra->ra_pages || blk_cgroup_congested()) { + if (!ractl->file) + return; + req_count = 1; + do_forced_ra = true; + } + + /* be dumb */ + if (do_forced_ra) { + force_page_cache_ra(ractl, req_count); + return; + } + + ondemand_readahead(ractl, NULL, req_count); +} +EXPORT_SYMBOL_GPL(page_cache_sync_ra); + +void page_cache_async_ra(struct readahead_control *ractl, + struct folio *folio, unsigned long req_count) +{ + /* no readahead */ + if (!ractl->ra->ra_pages) + return; + + /* + * Same bit is used for PG_readahead and PG_reclaim. + */ + if (folio_test_writeback(folio)) + return; + + folio_clear_readahead(folio); + + if (blk_cgroup_congested()) + return; + + ondemand_readahead(ractl, folio, req_count); +} +EXPORT_SYMBOL_GPL(page_cache_async_ra); + +ssize_t ksys_readahead(int fd, loff_t offset, size_t count) +{ + ssize_t ret; + struct fd f; + + ret = -EBADF; + f = fdget(fd); + if (!f.file || !(f.file->f_mode & FMODE_READ)) + goto out; + + /* + * The readahead() syscall is intended to run only on files + * that can execute readahead. If readahead is not possible + * on this file, then we must return -EINVAL. 
+ */ + ret = -EINVAL; + if (!f.file->f_mapping || !f.file->f_mapping->a_ops || + (!S_ISREG(file_inode(f.file)->i_mode) && + !S_ISBLK(file_inode(f.file)->i_mode))) + goto out; + + ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED); +out: + fdput(f); + return ret; +} + +SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) +{ + return ksys_readahead(fd, offset, count); +} + +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD) +COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count) +{ + return ksys_readahead(fd, compat_arg_u64_glue(offset), count); +} +#endif + +/** + * readahead_expand - Expand a readahead request + * @ractl: The request to be expanded + * @new_start: The revised start + * @new_len: The revised size of the request + * + * Attempt to expand a readahead request outwards from the current size to the + * specified size by inserting locked pages before and after the current window + * to increase the size to the new window. This may involve the insertion of + * THPs, in which case the window may get expanded even beyond what was + * requested. + * + * The algorithm will stop if it encounters a conflicting page already in the + * pagecache and leave a smaller expansion than requested. + * + * The caller must check for this by examining the revised @ractl object for a + * different expansion than was requested. 
+ */ +void readahead_expand(struct readahead_control *ractl, + loff_t new_start, size_t new_len) +{ + struct address_space *mapping = ractl->mapping; + struct file_ra_state *ra = ractl->ra; + pgoff_t new_index, new_nr_pages; + gfp_t gfp_mask = readahead_gfp_mask(mapping); + + new_index = new_start / PAGE_SIZE; + + /* Expand the leading edge downwards */ + while (ractl->_index > new_index) { + unsigned long index = ractl->_index - 1; + struct page *page = xa_load(&mapping->i_pages, index); + + if (page && !xa_is_value(page)) + return; /* Page apparently present */ + + page = __page_cache_alloc(gfp_mask); + if (!page) + return; + if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { + put_page(page); + return; + } + + ractl->_nr_pages++; + ractl->_index = page->index; + } + + new_len += new_start - readahead_pos(ractl); + new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE); + + /* Expand the trailing edge upwards */ + while (ractl->_nr_pages < new_nr_pages) { + unsigned long index = ractl->_index + ractl->_nr_pages; + struct page *page = xa_load(&mapping->i_pages, index); + + if (page && !xa_is_value(page)) + return; /* Page apparently present */ + + page = __page_cache_alloc(gfp_mask); + if (!page) + return; + if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) { + put_page(page); + return; + } + if (unlikely(PageWorkingset(page)) && !ractl->_workingset) { + ractl->_workingset = true; + psi_memstall_enter(&ractl->_pflags); + } + ractl->_nr_pages++; + if (ra) { + ra->size++; + ra->async_size++; + } + } +} +EXPORT_SYMBOL(readahead_expand); diff --git a/mm/rmap.c b/mm/rmap.c new file mode 100644 index 000000000..7da2d8d09 --- /dev/null +++ b/mm/rmap.c @@ -0,0 +1,2577 @@ +/* + * mm/rmap.c - physical to virtual reverse mappings + * + * Copyright 2001, Rik van Riel + * Released under the General Public License (GPL). + * + * Simple, low overhead reverse mapping scheme. + * Please try to keep this thing as modular as possible. 
+ * + * Provides methods for unmapping each kind of mapped page: + * the anon methods track anonymous pages, and + * the file methods track pages belonging to an inode. + * + * Original design by Rik van Riel 2001 + * File methods by Dave McCracken 2003, 2004 + * Anonymous methods by Andrea Arcangeli 2004 + * Contributions by Hugh Dickins 2003, 2004 + */ + +/* + * Lock ordering in mm: + * + * inode->i_rwsem (while writing or truncating, not reading or faulting) + * mm->mmap_lock + * mapping->invalidate_lock (in filemap_fault) + * page->flags PG_locked (lock_page) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) + * mapping->i_mmap_rwsem + * anon_vma->rwsem + * mm->page_table_lock or pte_lock + * swap_lock (in swap_duplicate, swap_info_get) + * mmlist_lock (in mmput, drain_mmlist and others) + * mapping->private_lock (in block_dirty_folio) + * folio_lock_memcg move_lock (in block_dirty_folio) + * i_pages lock (widely used) + * lruvec->lru_lock (in folio_lruvec_lock_irq) + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) + * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) + * sb_lock (within inode_lock in fs/fs-writeback.c) + * i_pages lock (widely used, in set_page_dirty, + * in arch-dependent flush_dcache_mmap_lock, + * within bdi.wb->list_lock in __sync_single_inode) + * + * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) + * ->tasklist_lock + * pte map lock + * + * hugetlbfs PageHuge() take locks in this order: + * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) + * vma_lock (hugetlb specific lock for pmd_sharing) + * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing) + * page->flags PG_locked (lock_page) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define CREATE_TRACE_POINTS 
+#include +#include + +#include "internal.h" + +static struct kmem_cache *anon_vma_cachep; +static struct kmem_cache *anon_vma_chain_cachep; + +static inline struct anon_vma *anon_vma_alloc(void) +{ + struct anon_vma *anon_vma; + + anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); + if (anon_vma) { + atomic_set(&anon_vma->refcount, 1); + anon_vma->num_children = 0; + anon_vma->num_active_vmas = 0; + anon_vma->parent = anon_vma; + /* + * Initialise the anon_vma root to point to itself. If called + * from fork, the root will be reset to the parents anon_vma. + */ + anon_vma->root = anon_vma; + } + + return anon_vma; +} + +static inline void anon_vma_free(struct anon_vma *anon_vma) +{ + VM_BUG_ON(atomic_read(&anon_vma->refcount)); + + /* + * Synchronize against folio_lock_anon_vma_read() such that + * we can safely hold the lock without the anon_vma getting + * freed. + * + * Relies on the full mb implied by the atomic_dec_and_test() from + * put_anon_vma() against the acquire barrier implied by + * down_read_trylock() from folio_lock_anon_vma_read(). This orders: + * + * folio_lock_anon_vma_read() VS put_anon_vma() + * down_read_trylock() atomic_dec_and_test() + * LOCK MB + * atomic_read() rwsem_is_locked() + * + * LOCK should suffice since the actual taking of the lock must + * happen _before_ what follows. 
+ */ + might_sleep(); + if (rwsem_is_locked(&anon_vma->root->rwsem)) { + anon_vma_lock_write(anon_vma); + anon_vma_unlock_write(anon_vma); + } + + kmem_cache_free(anon_vma_cachep, anon_vma); +} + +static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) +{ + return kmem_cache_alloc(anon_vma_chain_cachep, gfp); +} + +static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) +{ + kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); +} + +static void anon_vma_chain_link(struct vm_area_struct *vma, + struct anon_vma_chain *avc, + struct anon_vma *anon_vma) +{ + avc->vma = vma; + avc->anon_vma = anon_vma; + list_add(&avc->same_vma, &vma->anon_vma_chain); + anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); +} + +/** + * __anon_vma_prepare - attach an anon_vma to a memory region + * @vma: the memory region in question + * + * This makes sure the memory mapping described by 'vma' has + * an 'anon_vma' attached to it, so that we can associate the + * anonymous pages mapped into it with that anon_vma. + * + * The common case will be that we already have one, which + * is handled inline by anon_vma_prepare(). But if + * not we either need to find an adjacent mapping that we + * can re-use the anon_vma from (very common when the only + * reason for splitting a vma has been mprotect()), or we + * allocate a new one. + * + * Anon-vma allocations are very subtle, because we may have + * optimistically looked up an anon_vma in folio_lock_anon_vma_read() + * and that may actually touch the rwsem even in the newly + * allocated vma (it depends on RCU to make sure that the + * anon_vma isn't actually destroyed). + * + * As a result, we need to do proper anon_vma locking even + * for the new allocation. At the same time, we do not want + * to do any locking for the common case of already having + * an anon_vma. + * + * This must be called with the mmap_lock held for reading. 
+ */ +int __anon_vma_prepare(struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + struct anon_vma *anon_vma, *allocated; + struct anon_vma_chain *avc; + + might_sleep(); + + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) + goto out_enomem; + + anon_vma = find_mergeable_anon_vma(vma); + allocated = NULL; + if (!anon_vma) { + anon_vma = anon_vma_alloc(); + if (unlikely(!anon_vma)) + goto out_enomem_free_avc; + anon_vma->num_children++; /* self-parent link for new root */ + allocated = anon_vma; + } + + anon_vma_lock_write(anon_vma); + /* page_table_lock to protect against threads */ + spin_lock(&mm->page_table_lock); + if (likely(!vma->anon_vma)) { + vma->anon_vma = anon_vma; + anon_vma_chain_link(vma, avc, anon_vma); + anon_vma->num_active_vmas++; + allocated = NULL; + avc = NULL; + } + spin_unlock(&mm->page_table_lock); + anon_vma_unlock_write(anon_vma); + + if (unlikely(allocated)) + put_anon_vma(allocated); + if (unlikely(avc)) + anon_vma_chain_free(avc); + + return 0; + + out_enomem_free_avc: + anon_vma_chain_free(avc); + out_enomem: + return -ENOMEM; +} + +/* + * This is a useful helper function for locking the anon_vma root as + * we traverse the vma->anon_vma_chain, looping over anon_vma's that + * have the same vma. + * + * Such anon_vma's should have the same root, so you'd expect to see + * just a single mutex_lock for the whole traversal. + */ +static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) +{ + struct anon_vma *new_root = anon_vma->root; + if (new_root != root) { + if (WARN_ON_ONCE(root)) + up_write(&root->rwsem); + root = new_root; + down_write(&root->rwsem); + } + return root; +} + +static inline void unlock_anon_vma_root(struct anon_vma *root) +{ + if (root) + up_write(&root->rwsem); +} + +/* + * Attach the anon_vmas from src to dst. + * Returns 0 on success, -ENOMEM on failure. 
+ * + * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and + * anon_vma_fork(). The first three want an exact copy of src, while the last + * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent + * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call, + * we can identify this case by checking (!dst->anon_vma && src->anon_vma). + * + * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find + * and reuse existing anon_vma which has no vmas and only one child anon_vma. + * This prevents degradation of anon_vma hierarchy to endless linear chain in + * case of constantly forking task. On the other hand, an anon_vma with more + * than one child isn't reused even if there was no alive vma, thus rmap + * walker has a good chance of avoiding scanning the whole hierarchy when it + * searches where page is mapped. + */ +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) +{ + struct anon_vma_chain *avc, *pavc; + struct anon_vma *root = NULL; + + list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { + struct anon_vma *anon_vma; + + avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); + if (unlikely(!avc)) { + unlock_anon_vma_root(root); + root = NULL; + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) + goto enomem_failure; + } + anon_vma = pavc->anon_vma; + root = lock_anon_vma_root(root, anon_vma); + anon_vma_chain_link(dst, avc, anon_vma); + + /* + * Reuse existing anon_vma if it has no vma and only one + * anon_vma child. + * + * Root anon_vma is never reused: + * it has self-parent reference and at least one child. 
+ */ + if (!dst->anon_vma && src->anon_vma && + anon_vma->num_children < 2 && + anon_vma->num_active_vmas == 0) + dst->anon_vma = anon_vma; + } + if (dst->anon_vma) + dst->anon_vma->num_active_vmas++; + unlock_anon_vma_root(root); + return 0; + + enomem_failure: + /* + * dst->anon_vma is dropped here otherwise its degree can be incorrectly + * decremented in unlink_anon_vmas(). + * We can safely do this because callers of anon_vma_clone() don't care + * about dst->anon_vma if anon_vma_clone() failed. + */ + dst->anon_vma = NULL; + unlink_anon_vmas(dst); + return -ENOMEM; +} + +/* + * Attach vma to its own anon_vma, as well as to the anon_vmas that + * the corresponding VMA in the parent process is attached to. + * Returns 0 on success, non-zero on failure. + */ +int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) +{ + struct anon_vma_chain *avc; + struct anon_vma *anon_vma; + int error; + + /* Don't bother if the parent process has no anon_vma here. */ + if (!pvma->anon_vma) + return 0; + + /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ + vma->anon_vma = NULL; + + /* + * First, attach the new VMA to the parent VMA's anon_vmas, + * so rmap can find non-COWed pages in child processes. + */ + error = anon_vma_clone(vma, pvma); + if (error) + return error; + + /* An existing anon_vma has been reused, all done then. */ + if (vma->anon_vma) + return 0; + + /* Then add our own anon_vma. */ + anon_vma = anon_vma_alloc(); + if (!anon_vma) + goto out_error; + anon_vma->num_active_vmas++; + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) + goto out_error_free_anon_vma; + + /* + * The root anon_vma's rwsem is the lock actually used when we + * lock any of the anon_vmas in this anon_vma tree. + */ + anon_vma->root = pvma->anon_vma->root; + anon_vma->parent = pvma->anon_vma; + /* + * With refcounts, an anon_vma can stay around longer than the + * process it belongs to. 
The root anon_vma needs to be pinned until + * this anon_vma is freed, because the lock lives in the root. + */ + get_anon_vma(anon_vma->root); + /* Mark this anon_vma as the one where our new (COWed) pages go. */ + vma->anon_vma = anon_vma; + anon_vma_lock_write(anon_vma); + anon_vma_chain_link(vma, avc, anon_vma); + anon_vma->parent->num_children++; + anon_vma_unlock_write(anon_vma); + + return 0; + + out_error_free_anon_vma: + put_anon_vma(anon_vma); + out_error: + unlink_anon_vmas(vma); + return -ENOMEM; +} + +void unlink_anon_vmas(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc, *next; + struct anon_vma *root = NULL; + + /* + * Unlink each anon_vma chained to the VMA. This list is ordered + * from newest to oldest, ensuring the root anon_vma gets freed last. + */ + list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { + struct anon_vma *anon_vma = avc->anon_vma; + + root = lock_anon_vma_root(root, anon_vma); + anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); + + /* + * Leave empty anon_vmas on the list - we'll need + * to free them outside the lock. + */ + if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { + anon_vma->parent->num_children--; + continue; + } + + list_del(&avc->same_vma); + anon_vma_chain_free(avc); + } + if (vma->anon_vma) { + vma->anon_vma->num_active_vmas--; + + /* + * vma would still be needed after unlink, and anon_vma will be prepared + * when handle fault. + */ + vma->anon_vma = NULL; + } + unlock_anon_vma_root(root); + + /* + * Iterate the list once more, it now only contains empty and unlinked + * anon_vmas, destroy them. Could not do before due to __put_anon_vma() + * needing to write-acquire the anon_vma->root->rwsem. 
+ */ + list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { + struct anon_vma *anon_vma = avc->anon_vma; + + VM_WARN_ON(anon_vma->num_children); + VM_WARN_ON(anon_vma->num_active_vmas); + put_anon_vma(anon_vma); + + list_del(&avc->same_vma); + anon_vma_chain_free(avc); + } +} + +static void anon_vma_ctor(void *data) +{ + struct anon_vma *anon_vma = data; + + init_rwsem(&anon_vma->rwsem); + atomic_set(&anon_vma->refcount, 0); + anon_vma->rb_root = RB_ROOT_CACHED; +} + +void __init anon_vma_init(void) +{ + anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), + 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, + anon_vma_ctor); + anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, + SLAB_PANIC|SLAB_ACCOUNT); +} + +/* + * Getting a lock on a stable anon_vma from a page off the LRU is tricky! + * + * Since there is no serialization what so ever against page_remove_rmap() + * the best this function can do is return a refcount increased anon_vma + * that might have been relevant to this page. + * + * The page might have been remapped to a different anon_vma or the anon_vma + * returned may already be freed (and even reused). + * + * In case it was remapped to a different anon_vma, the new anon_vma will be a + * child of the old anon_vma, and the anon_vma lifetime rules will therefore + * ensure that any anon_vma obtained from the page will still be valid for as + * long as we observe page_mapped() [ hence all those page_mapped() tests ]. + * + * All users of this function must be very careful when walking the anon_vma + * chain and verify that the page in question is indeed mapped in it + * [ something equivalent to page_mapped_in_vma() ]. + * + * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from + * page_remove_rmap() that the anon_vma pointer from page->mapping is valid + * if there is a mapcount, we can dereference the anon_vma after observing + * those. 
+ */ +struct anon_vma *folio_get_anon_vma(struct folio *folio) +{ + struct anon_vma *anon_vma = NULL; + unsigned long anon_mapping; + + rcu_read_lock(); + anon_mapping = (unsigned long)READ_ONCE(folio->mapping); + if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + goto out; + if (!folio_mapped(folio)) + goto out; + + anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + if (!atomic_inc_not_zero(&anon_vma->refcount)) { + anon_vma = NULL; + goto out; + } + + /* + * If this folio is still mapped, then its anon_vma cannot have been + * freed. But if it has been unmapped, we have no security against the + * anon_vma structure being freed and reused (for another anon_vma: + * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() + * above cannot corrupt). + */ + if (!folio_mapped(folio)) { + rcu_read_unlock(); + put_anon_vma(anon_vma); + return NULL; + } +out: + rcu_read_unlock(); + + return anon_vma; +} + +/* + * Similar to folio_get_anon_vma() except it locks the anon_vma. + * + * It's a little more complex as it tries to keep the fast path to a single + * atomic op -- the trylock. If we fail the trylock, we fall back to getting a + * reference like with folio_get_anon_vma() and then block on the rwsem + * on !rwc->try_lock case.
+ */ +struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, + struct rmap_walk_control *rwc) +{ + struct anon_vma *anon_vma = NULL; + struct anon_vma *root_anon_vma; + unsigned long anon_mapping; + + rcu_read_lock(); + anon_mapping = (unsigned long)READ_ONCE(folio->mapping); + if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + goto out; + if (!folio_mapped(folio)) + goto out; + + anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + root_anon_vma = READ_ONCE(anon_vma->root); + if (down_read_trylock(&root_anon_vma->rwsem)) { + /* + * If the folio is still mapped, then this anon_vma is still + * its anon_vma, and holding the root rwsem ensures that it will + * not go away, see anon_vma_free(). + */ + if (!folio_mapped(folio)) { + up_read(&root_anon_vma->rwsem); + anon_vma = NULL; + } + goto out; + } + + if (rwc && rwc->try_lock) { + anon_vma = NULL; + rwc->contended = true; + goto out; + } + + /* trylock failed and blocking is allowed: pin the anon_vma before sleeping */ + if (!atomic_inc_not_zero(&anon_vma->refcount)) { + anon_vma = NULL; + goto out; + } + + if (!folio_mapped(folio)) { + rcu_read_unlock(); + put_anon_vma(anon_vma); + return NULL; + } + + /* we pinned the anon_vma, it's safe to sleep */ + rcu_read_unlock(); + anon_vma_lock_read(anon_vma); + + if (atomic_dec_and_test(&anon_vma->refcount)) { + /* + * Oops, we held the last refcount, release the lock + * and bail -- can't simply use put_anon_vma() because + * we'll deadlock on the anon_vma_lock_write() recursion. + */ + anon_vma_unlock_read(anon_vma); + __put_anon_vma(anon_vma); + anon_vma = NULL; + } + + return anon_vma; + +out: + rcu_read_unlock(); + return anon_vma; +} + +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +/* + * Flush TLB entries for recently unmapped pages from remote CPUs. It is + * important if a PTE was dirty when it was unmapped that it's flushed + * before any IO is initiated on the page to prevent lost writes. Similarly, + * it must be flushed before freeing to prevent data leakage.
+ */ +void try_to_unmap_flush(void) +{ + struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + + if (!tlb_ubc->flush_required) + return; + + arch_tlbbatch_flush(&tlb_ubc->arch); + tlb_ubc->flush_required = false; + tlb_ubc->writable = false; +} + +/* Flush iff there are potentially writable TLB entries that can race with IO */ +void try_to_unmap_flush_dirty(void) +{ + struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + + if (tlb_ubc->writable) + try_to_unmap_flush(); +} + +/* + * Bits 0-14 of mm->tlb_flush_batched record pending generations. + * Bits 16-30 of mm->tlb_flush_batched bit record flushed generations. + */ +#define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16 +#define TLB_FLUSH_BATCH_PENDING_MASK \ + ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1) +#define TLB_FLUSH_BATCH_PENDING_LARGE \ + (TLB_FLUSH_BATCH_PENDING_MASK / 2) + +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) +{ + struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + int batch, nbatch; + + arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); + tlb_ubc->flush_required = true; + + /* + * Ensure compiler does not re-order the setting of tlb_flush_batched + * before the PTE is cleared. + */ + barrier(); + batch = atomic_read(&mm->tlb_flush_batched); +retry: + if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) { + /* + * Prevent `pending' from catching up with `flushed' because of + * overflow. Reset `pending' and `flushed' to be 1 and 0 if + * `pending' becomes large. + */ + nbatch = atomic_cmpxchg(&mm->tlb_flush_batched, batch, 1); + if (nbatch != batch) { + batch = nbatch; + goto retry; + } + } else { + atomic_inc(&mm->tlb_flush_batched); + } + + /* + * If the PTE was dirty then it's best to assume it's writable. The + * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() + * before the page is queued for IO. 
+ */ + if (writable) + tlb_ubc->writable = true; +} + +/* + * Returns true if the TLB flush should be deferred to the end of a batch of + * unmap operations to reduce IPIs. + */ +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + bool should_defer = false; + + if (!(flags & TTU_BATCH_FLUSH)) + return false; + + /* If remote CPUs need to be flushed then defer and batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} + +/* + * Reclaim unmaps pages under the PTL but does not flush the TLB prior to + * releasing the PTL if TLB flushes are batched. It's possible for a parallel + * operation such as mprotect or munmap to race between reclaim unmapping + * the page and flushing the page. If this race occurs, it potentially allows + * access to data via a stale TLB entry. Tracking all mm's that have TLB + * batching in flight would be expensive during reclaim so instead track + * whether TLB batching occurred in the past and if so then do a flush here + * if required. This will cost one additional flush per reclaim cycle paid + * by the first operation at risk such as mprotect and munmap. + * + * This must be called under the PTL so that an access to tlb_flush_batched + * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise + * via the PTL. + */ +void flush_tlb_batched_pending(struct mm_struct *mm) +{ + int batch = atomic_read(&mm->tlb_flush_batched); + int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK; + int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; + + if (pending != flushed) { + flush_tlb_mm(mm); + /* + * If the new TLB flushing is pending during flushing, leave + * mm->tlb_flush_batched as is, to avoid losing flushing.
+ */ + atomic_cmpxchg(&mm->tlb_flush_batched, batch, + pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT)); + } +} +#else +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) +{ +} + +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + return false; +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + +/* + * At what user virtual address is page expected in vma? + * Caller should check the page is actually part of the vma. + */ +unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) +{ + struct folio *folio = page_folio(page); + if (folio_test_anon(folio)) { + struct anon_vma *page__anon_vma = folio_anon_vma(folio); + /* + * Note: swapoff's unuse_vma() is more efficient with this + * check, and needs it to match anon_vma when KSM is active. + */ + if (!vma->anon_vma || !page__anon_vma || + vma->anon_vma->root != page__anon_vma->root) + return -EFAULT; + } else if (!vma->vm_file) { + return -EFAULT; + } else if (vma->vm_file->f_mapping != folio->mapping) { + return -EFAULT; + } + + return vma_address(page, vma); +} + +/* + * Returns the actual pmd_t* where we expect 'address' to be mapped from, or + * NULL if it doesn't exist. No guarantees / checks on what the pmd_t* + * represents. 
+ */ +pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + goto out; + + pud = pud_offset(p4d, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); +out: + return pmd; +} + +struct folio_referenced_arg { + int mapcount; + int referenced; + unsigned long vm_flags; + struct mem_cgroup *memcg; +}; +/* + * arg: folio_referenced_arg will be passed + */ +static bool folio_referenced_one(struct folio *folio, + struct vm_area_struct *vma, unsigned long address, void *arg) +{ + struct folio_referenced_arg *pra = arg; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); + int referenced = 0; + + while (page_vma_mapped_walk(&pvmw)) { + address = pvmw.address; + + if ((vma->vm_flags & VM_LOCKED) && + (!folio_test_large(folio) || !pvmw.pte)) { + /* Restore the mlock which got missed */ + mlock_vma_folio(folio, vma, !pvmw.pte); + page_vma_mapped_walk_done(&pvmw); + pra->vm_flags |= VM_LOCKED; + return false; /* To break the loop */ + } + + if (pvmw.pte) { + if (lru_gen_enabled() && pte_young(*pvmw.pte) && + !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) { + lru_gen_look_around(&pvmw); + referenced++; + } + + if (ptep_clear_flush_young_notify(vma, address, + pvmw.pte)) { + /* + * Don't treat a reference through + * a sequentially read mapping as such. + * If the folio has been used in another mapping, + * we will catch it; if this other mapping is + * already gone, the unmap path will have set + * the referenced flag or activated the folio. + */ + if (likely(!(vma->vm_flags & VM_SEQ_READ))) + referenced++; + } + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (pmdp_clear_flush_young_notify(vma, address, + pvmw.pmd)) + referenced++; + } else { + /* unexpected pmd-mapped folio? 
*/ + WARN_ON_ONCE(1); + } + + pra->mapcount--; + } + + if (referenced) + folio_clear_idle(folio); + if (folio_test_clear_young(folio)) + referenced++; + + if (referenced) { + pra->referenced++; + pra->vm_flags |= vma->vm_flags & ~VM_LOCKED; + } + + if (!pra->mapcount) + return false; /* To break the loop */ + + return true; +} + +static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) +{ + struct folio_referenced_arg *pra = arg; + struct mem_cgroup *memcg = pra->memcg; + + if (!mm_match_cgroup(vma->vm_mm, memcg)) + return true; + + return false; +} + +/** + * folio_referenced() - Test if the folio was referenced. + * @folio: The folio to test. + * @is_locked: Caller holds lock on the folio. + * @memcg: target memory cgroup + * @vm_flags: A combination of all the vma->vm_flags which referenced the folio. + * + * Quick test_and_clear_referenced for all mappings of a folio, + * + * Return: The number of mappings which referenced the folio. Return -1 if + * the function bailed out due to rmap lock contention. 
+ */ +int folio_referenced(struct folio *folio, int is_locked, + struct mem_cgroup *memcg, unsigned long *vm_flags) +{ + int we_locked = 0; + struct folio_referenced_arg pra = { + .mapcount = folio_mapcount(folio), + .memcg = memcg, + }; + struct rmap_walk_control rwc = { + .rmap_one = folio_referenced_one, + .arg = (void *)&pra, + .anon_lock = folio_lock_anon_vma_read, + .try_lock = true, + }; + + *vm_flags = 0; + if (!pra.mapcount) + return 0; + + if (!folio_raw_mapping(folio)) + return 0; + + if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) { + we_locked = folio_trylock(folio); + if (!we_locked) + return 1; + } + + /* + * If we are reclaiming on behalf of a cgroup, skip + * counting on behalf of references from different + * cgroups + */ + if (memcg) { + rwc.invalid_vma = invalid_folio_referenced_vma; + } + + rmap_walk(folio, &rwc); + *vm_flags = pra.vm_flags; + + if (we_locked) + folio_unlock(folio); + + return rwc.contended ? -1 : pra.referenced; +} + +static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) +{ + int cleaned = 0; + struct vm_area_struct *vma = pvmw->vma; + struct mmu_notifier_range range; + unsigned long address = pvmw->address; + + /* + * We have to assume the worse case ie pmd for invalidation. Note that + * the folio can not be freed from this function. 
+ */ + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, + 0, vma, vma->vm_mm, address, + vma_address_end(pvmw)); + mmu_notifier_invalidate_range_start(&range); + + while (page_vma_mapped_walk(pvmw)) { + int ret = 0; + + address = pvmw->address; + if (pvmw->pte) { + pte_t entry; + pte_t *pte = pvmw->pte; + + if (!pte_dirty(*pte) && !pte_write(*pte)) + continue; + + flush_cache_page(vma, address, pte_pfn(*pte)); + entry = ptep_clear_flush(vma, address, pte); + entry = pte_wrprotect(entry); + entry = pte_mkclean(entry); + set_pte_at(vma->vm_mm, address, pte, entry); + ret = 1; + } else { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pmd_t *pmd = pvmw->pmd; + pmd_t entry; + + if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) + continue; + + flush_cache_range(vma, address, + address + HPAGE_PMD_SIZE); + entry = pmdp_invalidate(vma, address, pmd); + entry = pmd_wrprotect(entry); + entry = pmd_mkclean(entry); + set_pmd_at(vma->vm_mm, address, pmd, entry); + ret = 1; +#else + /* unexpected pmd-mapped folio? */ + WARN_ON_ONCE(1); +#endif + } + + /* + * No need to call mmu_notifier_invalidate_range() as we are + * downgrading page table protection not changing it to point + * to a new page. 
+ * + * See Documentation/mm/mmu_notifier.rst + */ + if (ret) + cleaned++; + } + + mmu_notifier_invalidate_range_end(&range); + + return cleaned; +} + +static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, + unsigned long address, void *arg) +{ + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC); + int *cleaned = arg; + + *cleaned += page_vma_mkclean_one(&pvmw); + + return true; +} + +static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) +{ + if (vma->vm_flags & VM_SHARED) + return false; + + return true; +} + +int folio_mkclean(struct folio *folio) +{ + int cleaned = 0; + struct address_space *mapping; + struct rmap_walk_control rwc = { + .arg = (void *)&cleaned, + .rmap_one = page_mkclean_one, + .invalid_vma = invalid_mkclean_vma, + }; + + BUG_ON(!folio_test_locked(folio)); + + if (!folio_mapped(folio)) + return 0; + + mapping = folio_mapping(folio); + if (!mapping) + return 0; + + rmap_walk(folio, &rwc); + + return cleaned; +} +EXPORT_SYMBOL_GPL(folio_mkclean); + +/** + * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of + * [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff) + * within the @vma of shared mappings. And since clean PTEs + * should also be readonly, write protects them too. + * @pfn: start pfn. + * @nr_pages: number of physically contiguous pages starting with @pfn. + * @pgoff: page offset that the @pfn mapped with. + * @vma: vma that @pfn mapped within. + * + * Returns the number of cleaned PTEs (including PMDs).
+ */ +int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, + struct vm_area_struct *vma) +{ + struct page_vma_mapped_walk pvmw = { + .pfn = pfn, + .nr_pages = nr_pages, + .pgoff = pgoff, + .vma = vma, + .flags = PVMW_SYNC, + }; + /* Only VM_SHARED vmas are handled, see invalid_mkclean_vma(). */ + if (invalid_mkclean_vma(vma, NULL)) + return 0; + /* The range is expected to lie within @vma: -EFAULT here is a bug. */ + pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma); + VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma); + + return page_vma_mkclean_one(&pvmw); +} + +/** + * page_move_anon_rmap - move a page to our anon_vma + * @page: the page to move to our anon_vma + * @vma: the vma the page belongs to + * + * When a page belongs exclusively to one process after a COW event, + * that page can be moved into the anon_vma that belongs to just that + * process, so the rmap code will not search the parent or sibling + * processes. + */ +void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) +{ + void *anon_vma = vma->anon_vma; + struct folio *folio = page_folio(page); + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_VMA(!anon_vma, vma); + + anon_vma += PAGE_MAPPING_ANON; + /* + * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written + * simultaneously, so a concurrent reader (eg folio_referenced()'s + * folio_test_anon()) will not see one without the other. + */ + WRITE_ONCE(folio->mapping, anon_vma); + SetPageAnonExclusive(page); +} + +/** + * __page_set_anon_rmap - set up new anonymous rmap + * @page: Page or Hugepage to add to rmap + * @vma: VM area to add page to. 
+ * @address: User virtual address of the mapping + * @exclusive: the page is exclusively owned by the current process + */ +static void __page_set_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address, int exclusive) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + BUG_ON(!anon_vma); + /* Already anonymous: leave mapping and index alone, only set the flag. */ + if (PageAnon(page)) + goto out; + + /* + * If the page isn't exclusively mapped into this vma, + * we must use the _oldest_ possible anon_vma for the + * page mapping! + */ + if (!exclusive) + anon_vma = anon_vma->root; + + /* + * page_idle does a lockless/optimistic rmap scan on page->mapping. + * Make sure the compiler doesn't split the stores of anon_vma and + * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code + * could mistake the mapping for a struct address_space and crash. + */ + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); + page->index = linear_page_index(vma, address); +out: + if (exclusive) + SetPageAnonExclusive(page); +} + +/** + * __page_check_anon_rmap - sanity check anonymous rmap addition + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + */ +static void __page_check_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct folio *folio = page_folio(page); + /* + * The page's anon-rmap details (mapping and index) are guaranteed to + * be set up correctly at this point. + * + * We have exclusion against page_add_anon_rmap because the caller + * always holds the page locked. + * + * We have exclusion against page_add_new_anon_rmap because those pages + * are initially only visible via the pagetables, and the pte is locked + * over the call to page_add_new_anon_rmap. 
+ */ + VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, + folio); + VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address), + page); +} + +/** + * page_add_anon_rmap - add pte mapping to an anonymous page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + * @flags: the rmap flags + * + * The caller needs to hold the pte lock, and the page must be locked in + * the anon_vma case: to serialize mapping,index checking after setting, + * and to ensure that PageAnon is not being upgraded racily to PageKsm + * (but PageKsm is never downgraded to PageAnon). + */ +void page_add_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address, rmap_t flags) +{ + bool compound = flags & RMAP_COMPOUND; + bool first; + /* KSM pages take the memcg page lock; any other page must be locked. */ + if (unlikely(PageKsm(page))) + lock_page_memcg(page); + else + VM_BUG_ON_PAGE(!PageLocked(page), page); + + if (compound) { + atomic_t *mapcount; + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + mapcount = compound_mapcount_ptr(page); + first = atomic_inc_and_test(mapcount); + } else { + first = atomic_inc_and_test(&page->_mapcount); + } + VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); + VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); + /* Only the first mapping updates the NR_ANON_* statistics. */ + if (first) { + int nr = compound ? thp_nr_pages(page) : 1; + /* + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption + * disabled. 
+ */ + if (compound) + __mod_lruvec_page_state(page, NR_ANON_THPS, nr); + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); + } + + if (unlikely(PageKsm(page))) + unlock_page_memcg(page); + + /* address might be in next vma when migration races vma_adjust */ + else if (first) + __page_set_anon_rmap(page, vma, address, + !!(flags & RMAP_EXCLUSIVE)); + else + __page_check_anon_rmap(page, vma, address); + + mlock_vma_page(page, vma, compound); +} + +/** + * page_add_new_anon_rmap - add mapping to a new anonymous page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @address: the user virtual address mapped + * + * If it's a compound page, it is accounted as a compound page. As the page + * is new, it's assumed to get mapped exclusively by a single process. + * + * Same as page_add_anon_rmap but must only be called on *new* pages. + * This means the inc-and-test can be bypassed. + * Page does not have to be locked. + */ +void page_add_new_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + const bool compound = PageCompound(page); + int nr = compound ? thp_nr_pages(page) : 1; + + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + __SetPageSwapBacked(page); + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + /* increment count (starts at -1) */ + atomic_set(compound_mapcount_ptr(page), 0); + atomic_set(compound_pincount_ptr(page), 0); + + __mod_lruvec_page_state(page, NR_ANON_THPS, nr); + } else { + /* increment count (starts at -1) */ + atomic_set(&page->_mapcount, 0); + } + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); + __page_set_anon_rmap(page, vma, address, 1); +} + +/** + * page_add_file_rmap - add pte mapping to a file page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @compound: charge the page as compound or small page + * + * The caller needs to hold the pte lock. 
+ */ +void page_add_file_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + int i, nr = 0; + + VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); + lock_page_memcg(page); + if (compound && PageTransHuge(page)) { + int nr_pages = thp_nr_pages(page); + + for (i = 0; i < nr_pages; i++) { + if (atomic_inc_and_test(&page[i]._mapcount)) + nr++; + } + if (!atomic_inc_and_test(compound_mapcount_ptr(page))) + goto out; + + /* + * It is racy to ClearPageDoubleMap in page_remove_file_rmap(); + * but page lock is held by all page_add_file_rmap() compound + * callers, and SetPageDoubleMap below warns if !PageLocked: + * so here is a place that DoubleMap can be safely cleared. + */ + VM_WARN_ON_ONCE(!PageLocked(page)); + if (nr == nr_pages && PageDoubleMap(page)) + ClearPageDoubleMap(page); + + if (PageSwapBacked(page)) + __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, + nr_pages); + else + __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, + nr_pages); + } else { + if (PageTransCompound(page) && page_mapping(page)) { + VM_WARN_ON_ONCE(!PageLocked(page)); + SetPageDoubleMap(compound_head(page)); + } + if (atomic_inc_and_test(&page->_mapcount)) + nr++; + } +out: + if (nr) + __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); + unlock_page_memcg(page); + + mlock_vma_page(page, vma, compound); +} +/* + * Remove one mapping of a file page (the PMD mapping of a THP if @compound) + * and adjust the NR_FILE_MAPPED and NR_SHMEM_PMDMAPPED/NR_FILE_PMDMAPPED + * counters. hugetlb pages only have their compound mapcount decremented. + */ +static void page_remove_file_rmap(struct page *page, bool compound) +{ + int i, nr = 0; + + VM_BUG_ON_PAGE(compound && !PageHead(page), page); + + /* Hugepages are not counted in NR_FILE_MAPPED for now. */ + if (unlikely(PageHuge(page))) { + /* hugetlb pages are always mapped with pmds */ + atomic_dec(compound_mapcount_ptr(page)); + return; + } + + /* page still mapped by someone else? 
*/ + if (compound && PageTransHuge(page)) { + int nr_pages = thp_nr_pages(page); + + for (i = 0; i < nr_pages; i++) { + if (atomic_add_negative(-1, &page[i]._mapcount)) + nr++; + } + if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) + goto out; + if (PageSwapBacked(page)) + __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, + -nr_pages); + else + __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, + -nr_pages); + } else { + if (atomic_add_negative(-1, &page->_mapcount)) + nr++; + } +out: + if (nr) + __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); +} +/* + * Remove the compound (PMD) mapping of an anonymous huge page. Once the + * compound mapcount goes negative (and the page is not hugetlb), update + * NR_ANON_THPS and NR_ANON_MAPPED and queue a THP that is still partially + * PTE-mapped for deferred split. + */ +static void page_remove_anon_compound_rmap(struct page *page) +{ + int i, nr; + + if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) + return; + + /* Hugepages are not counted in NR_ANON_PAGES for now. */ + if (unlikely(PageHuge(page))) + return; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return; + + __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page)); + + if (TestClearPageDoubleMap(page)) { + /* + * Subpages can be mapped with PTEs too. Check how many of + * them are still mapped. + */ + for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { + if (atomic_add_negative(-1, &page[i]._mapcount)) + nr++; + } + + /* + * Queue the page for deferred split if at least one small + * page of the compound page is unmapped, but at least one + * small page is still mapped. + */ + if (nr && nr < thp_nr_pages(page)) + deferred_split_huge_page(page); + } else { + nr = thp_nr_pages(page); + } + + if (nr) + __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); +} + +/** + * page_remove_rmap - take down pte mapping from a page + * @page: page to remove mapping from + * @vma: the vm area from which the mapping is removed + * @compound: uncharge the page as compound or small page + * + * The caller needs to hold the pte lock. 
+ */ +void page_remove_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) +{ + lock_page_memcg(page); + /* Pages that are not PageAnon use the file rmap accounting. */ + if (!PageAnon(page)) { + page_remove_file_rmap(page, compound); + goto out; + } + + if (compound) { + page_remove_anon_compound_rmap(page); + goto out; + } + + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) + goto out; + + /* + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption disabled. + */ + __dec_lruvec_page_state(page, NR_ANON_MAPPED); + + if (PageTransCompound(page)) + deferred_split_huge_page(compound_head(page)); + + /* + * It would be tidy to reset the PageAnon mapping here, + * but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping + * before us: so leave the reset to free_unref_page, + * and remember that it's only reliable while mapped. + * Leaving it set also helps swapoff to reinstate ptes + * faster for those pages still in swapcache. + */ +out: + unlock_page_memcg(page); + + munlock_vma_page(page, vma, compound); +} + +/* + * rmap_walk callback used by try_to_unmap(): removes the page table entries + * mapping @folio in @vma. Returns false when the walk has to stop because a + * mapping could not be removed, true otherwise. + * + * @arg: enum ttu_flags will be passed to this argument + */ +static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, + unsigned long address, void *arg) +{ + struct mm_struct *mm = vma->vm_mm; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); + pte_t pteval; + struct page *subpage; + bool anon_exclusive, ret = true; + struct mmu_notifier_range range; + enum ttu_flags flags = (enum ttu_flags)(long)arg; + + /* + * When racing against e.g. zap_pte_range() on another cpu, + * in between its ptep_get_and_clear_full() and page_remove_rmap(), + * try_to_unmap() may return before page_mapped() has become false, + * if page table locking is skipped: use TTU_SYNC to wait for that. 
+ */ + if (flags & TTU_SYNC) + pvmw.flags = PVMW_SYNC; + + if (flags & TTU_SPLIT_HUGE_PMD) + split_huge_pmd_address(vma, address, false, folio); + + /* + * For THP, we have to assume the worst case, i.e. pmd, for invalidation. + * For hugetlb, it could be much worse if we need to do pud + * invalidation in the case of pmd sharing. + * + * Note that the folio can not be freed in this function as the caller of + * try_to_unmap() must hold a reference on the folio. + */ + range.end = vma_address_end(&pvmw); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address, range.end); + if (folio_test_hugetlb(folio)) { + /* + * If sharing is possible, start and end will be adjusted + * accordingly. + */ + adjust_range_if_pmd_sharing_possible(vma, &range.start, + &range.end); + } + mmu_notifier_invalidate_range_start(&range); + + while (page_vma_mapped_walk(&pvmw)) { + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_FOLIO(!pvmw.pte, folio); + + /* + * If the folio is in an mlock()d vma, we must not swap it out. + */ + if (!(flags & TTU_IGNORE_MLOCK) && + (vma->vm_flags & VM_LOCKED)) { + /* Restore the mlock which got missed */ + mlock_vma_folio(folio, vma, false); + page_vma_mapped_walk_done(&pvmw); + ret = false; + break; + } + + subpage = folio_page(folio, + pte_pfn(*pvmw.pte) - folio_pfn(folio)); + address = pvmw.address; + anon_exclusive = folio_test_anon(folio) && + PageAnonExclusive(subpage); + + if (folio_test_hugetlb(folio)) { + bool anon = folio_test_anon(folio); + + /* + * The try_to_unmap() is only passed a hugetlb page + * in the case where the hugetlb page is poisoned. + */ + VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage); + /* + * huge_pmd_unshare may unmap an entire PMD page. + * There is no way of knowing exactly which PMDs may + * be cached for this mm, so we must flush them all. + * start/end were already adjusted above to cover this + * range. 
+ */ + flush_cache_range(vma, range.start, range.end); + + /* + * To call huge_pmd_unshare, i_mmap_rwsem must be + * held in write mode. Caller needs to explicitly + * do this outside rmap routines. + * + * We also must hold hugetlb vma_lock in write mode. + * Lock order dictates acquiring vma_lock BEFORE + * i_mmap_rwsem. We can only try lock here and fail + * if unsuccessful. + */ + if (!anon) { + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + if (!hugetlb_vma_trylock_write(vma)) { + page_vma_mapped_walk_done(&pvmw); + ret = false; + break; + } + if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + flush_tlb_range(vma, + range.start, range.end); + mmu_notifier_invalidate_range(mm, + range.start, range.end); + /* + * The ref count of the PMD page was + * dropped which is part of the way map + * counting is done for shared PMDs. + * Return 'true' here. When there is + * no other sharing, huge_pmd_unshare + * returns false and we will unmap the + * actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + hugetlb_vma_unlock_write(vma); + } + pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); + } else { + flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + /* Nuke the page table entry. */ + if (should_defer_flush(mm, flags)) { + /* + * We clear the PTE but do not flush so potentially + * a remote CPU could still be writing to the folio. + * If the entry was previously clean then the + * architecture must guarantee that a clear->dirty + * transition on a cached TLB entry is written through + * and traps if the PTE is unmapped. + */ + pteval = ptep_get_and_clear(mm, address, pvmw.pte); + + set_tlb_ubc_flush_pending(mm, pte_dirty(pteval)); + } else { + pteval = ptep_clear_flush(vma, address, pvmw.pte); + } + } + + /* + * Now the pte is cleared. If this pte was uffd-wp armed, + * we may want to replace a none pte with a marker pte if + * it's file-backed, so we don't lose the tracking info. 
+ */ + pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval); + + /* Set the dirty flag on the folio now the pte is gone. */ + if (pte_dirty(pteval)) + folio_mark_dirty(folio); + + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + /* Poisoned page: leave a hwpoison swap entry in place of the mapping. */ + if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) { + pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); + if (folio_test_hugetlb(folio)) { + hugetlb_count_sub(folio_nr_pages(folio), mm); + set_huge_pte_at(mm, address, pvmw.pte, pteval); + } else { + dec_mm_counter(mm, mm_counter(&folio->page)); + set_pte_at(mm, address, pvmw.pte, pteval); + } + + } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { + /* + * The guest indicated that the page content is of no + * interest anymore. Simply discard the pte, vmscan + * will take care of the rest. + * A future reference will then fault in a new zero + * page. When userfaultfd is active, we must not drop + * this page though, as its main user (postcopy + * migration) will not expect userfaults on already + * copied pages. + */ + dec_mm_counter(mm, mm_counter(&folio->page)); + /* We have to invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); + } else if (folio_test_anon(folio)) { + swp_entry_t entry = { .val = page_private(subpage) }; + pte_t swp_pte; + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... 
+ */ + if (unlikely(folio_test_swapbacked(folio) != + folio_test_swapcache(folio))) { + WARN_ON_ONCE(1); + ret = false; + /* We have to invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); + page_vma_mapped_walk_done(&pvmw); + break; + } + + /* MADV_FREE page check */ + if (!folio_test_swapbacked(folio)) { + int ref_count, map_count; + + /* + * Synchronize with gup_pte_range(): + * - clear PTE; barrier; read refcount + * - inc refcount; barrier; read PTE + */ + smp_mb(); + + ref_count = folio_ref_count(folio); + map_count = folio_mapcount(folio); + + /* + * Order reads for page refcount and dirty flag + * (see comments in __remove_mapping()). + */ + smp_rmb(); + + /* + * The only page refs must be one from isolation + * plus the rmap(s) (dropped by discard:). + */ + if (ref_count == 1 + map_count && + !folio_test_dirty(folio)) { + /* Invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, + address, address + PAGE_SIZE); + dec_mm_counter(mm, MM_ANONPAGES); + goto discard; + } + + /* + * If the folio was redirtied, it cannot be + * discarded. Remap the page to page table. + */ + set_pte_at(mm, address, pvmw.pte, pteval); + folio_set_swapbacked(folio); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + /* If the swap entry cannot be duplicated, restore the pte and stop. */ + if (swap_duplicate(entry) < 0) { + set_pte_at(mm, address, pvmw.pte, pteval); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + if (arch_unmap_one(mm, vma, address, pteval) < 0) { + swap_free(entry); + set_pte_at(mm, address, pvmw.pte, pteval); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + + /* See page_try_share_anon_rmap(): clear PTE first. 
*/ + if (anon_exclusive && + page_try_share_anon_rmap(subpage)) { + swap_free(entry); + set_pte_at(mm, address, pvmw.pte, pteval); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + /* + * Note: We *don't* remember if the page was mapped + * exclusively in the swap pte if the architecture + * doesn't support __HAVE_ARCH_PTE_SWP_EXCLUSIVE. In + * that case, swapin code has to re-determine that + * manually and might detect the page as possibly + * shared, for example, if there are other references on + * the page or if the page is under writeback. We made + * sure that there are no GUP pins on the page that + * would rely on it, so for GUP pins this is fine. + */ + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); + } + dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); + swp_pte = swp_entry_to_pte(entry); + if (anon_exclusive) + swp_pte = pte_swp_mkexclusive(swp_pte); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + set_pte_at(mm, address, pvmw.pte, swp_pte); + /* Invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); + } else { + /* + * This is a locked file-backed folio, + * so it cannot be removed from the page + * cache and replaced by a new folio before + * mmu_notifier_invalidate_range_end, so no + * concurrent thread might update its page table + * to point at a new folio while a device is + * still using this folio. 
+ * + * See Documentation/mm/mmu_notifier.rst + */ + dec_mm_counter(mm, mm_counter_file(&folio->page)); + } +discard: + /* + * No need to call mmu_notifier_invalidate_range() it has been + * done above for all cases requiring it to happen under page + * table lock before mmu_notifier_invalidate_range_end() + * + * See Documentation/mm/mmu_notifier.rst + */ + page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain_local(); + folio_put(folio); + } + + mmu_notifier_invalidate_range_end(&range); + + return ret; +} +/* + * invalid_vma callback used by try_to_migrate(): reports temporary stack + * vmas as invalid. + */ +static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) +{ + return vma_is_temporary_stack(vma); +} +/* + * rmap walk "done" callback: returns non-zero once @folio is no longer + * mapped. + */ +static int page_not_mapped(struct folio *folio) +{ + return !folio_mapped(folio); +} + +/** + * try_to_unmap - Try to remove all page table mappings to a folio. + * @folio: The folio to unmap. + * @flags: action and flags + * + * Tries to remove all the page table entries which are mapping this + * folio. It is the caller's responsibility to check if the folio is + * still mapped if needed (use TTU_SYNC to prevent accounting races). + * + * Context: Caller must hold the folio lock. + */ +void try_to_unmap(struct folio *folio, enum ttu_flags flags) +{ + struct rmap_walk_control rwc = { + .rmap_one = try_to_unmap_one, + .arg = (void *)flags, + .done = page_not_mapped, + .anon_lock = folio_lock_anon_vma_read, + }; + + if (flags & TTU_RMAP_LOCKED) + rmap_walk_locked(folio, &rwc); + else + rmap_walk(folio, &rwc); +} + +/* + * @arg: enum ttu_flags will be passed to this argument. + * + * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs + * containing migration entries. 
+ */ +static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, + unsigned long address, void *arg) +{ + struct mm_struct *mm = vma->vm_mm; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); + pte_t pteval; + struct page *subpage; + bool anon_exclusive, ret = true; + struct mmu_notifier_range range; + enum ttu_flags flags = (enum ttu_flags)(long)arg; + + /* + * When racing against e.g. zap_pte_range() on another cpu, + * in between its ptep_get_and_clear_full() and page_remove_rmap(), + * try_to_migrate() may return before page_mapped() has become false, + * if page table locking is skipped: use TTU_SYNC to wait for that. + */ + if (flags & TTU_SYNC) + pvmw.flags = PVMW_SYNC; + + /* + * unmap_page() in mm/huge_memory.c is the only user of migration with + * TTU_SPLIT_HUGE_PMD and it wants to freeze. + */ + if (flags & TTU_SPLIT_HUGE_PMD) + split_huge_pmd_address(vma, address, true, folio); + + /* + * For THP, we have to assume the worst case, i.e. pmd, for invalidation. + * For hugetlb, it could be much worse if we need to do pud + * invalidation in the case of pmd sharing. + * + * Note that the page can not be freed in this function as the caller of + * try_to_migrate() must hold a reference on the page. + */ + range.end = vma_address_end(&pvmw); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address, range.end); + if (folio_test_hugetlb(folio)) { + /* + * If sharing is possible, start and end will be adjusted + * accordingly. 
+ */ + adjust_range_if_pmd_sharing_possible(vma, &range.start, + &range.end); + } + mmu_notifier_invalidate_range_start(&range); + + while (page_vma_mapped_walk(&pvmw)) { +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + /* PMD-mapped THP migration entry */ + if (!pvmw.pte) { + subpage = folio_page(folio, + pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); + VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || + !folio_test_pmd_mappable(folio), folio); + + if (set_pmd_migration_entry(&pvmw, subpage)) { + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + continue; + } +#endif + + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_FOLIO(!pvmw.pte, folio); + + if (folio_is_zone_device(folio)) { + /* + * Our PTE is a non-present device exclusive entry and + * calculating the subpage as for the common case would + * result in an invalid pointer. + * + * Since only PAGE_SIZE pages can currently be + * migrated, just set it to page. This will need to be + * changed when hugepage migrations to device private + * memory are supported. + */ + VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio); + subpage = &folio->page; + } else { + subpage = folio_page(folio, + pte_pfn(*pvmw.pte) - folio_pfn(folio)); + } + address = pvmw.address; + anon_exclusive = folio_test_anon(folio) && + PageAnonExclusive(subpage); + + if (folio_test_hugetlb(folio)) { + bool anon = folio_test_anon(folio); + + /* + * huge_pmd_unshare may unmap an entire PMD page. + * There is no way of knowing exactly which PMDs may + * be cached for this mm, so we must flush them all. + * start/end were already adjusted above to cover this + * range. + */ + flush_cache_range(vma, range.start, range.end); + + /* + * To call huge_pmd_unshare, i_mmap_rwsem must be + * held in write mode. Caller needs to explicitly + * do this outside rmap routines. + * + * We also must hold hugetlb vma_lock in write mode. + * Lock order dictates acquiring vma_lock BEFORE + * i_mmap_rwsem. We can only try lock here and + * fail if unsuccessful. 
+ */ + if (!anon) { + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + if (!hugetlb_vma_trylock_write(vma)) { + page_vma_mapped_walk_done(&pvmw); + ret = false; + break; + } + if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + flush_tlb_range(vma, + range.start, range.end); + mmu_notifier_invalidate_range(mm, + range.start, range.end); + + /* + * The ref count of the PMD page was + * dropped which is part of the way map + * counting is done for shared PMDs. + * Return 'true' here. When there is + * no other sharing, huge_pmd_unshare + * returns false and we will unmap the + * actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + hugetlb_vma_unlock_write(vma); + } + /* Nuke the hugetlb page table entry */ + pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); + } else { + flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + /* Nuke the page table entry. */ + pteval = ptep_clear_flush(vma, address, pvmw.pte); + } + + /* Set the dirty flag on the folio now the pte is gone. */ + if (pte_dirty(pteval)) + folio_mark_dirty(folio); + + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + + if (folio_is_device_private(folio)) { + unsigned long pfn = folio_pfn(folio); + swp_entry_t entry; + pte_t swp_pte; + + if (anon_exclusive) + BUG_ON(page_try_share_anon_rmap(subpage)); + + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + entry = pte_to_swp_entry(pteval); + if (is_writable_device_private_entry(entry)) + entry = make_writable_migration_entry(pfn); + else if (anon_exclusive) + entry = make_readable_exclusive_migration_entry(pfn); + else + entry = make_readable_migration_entry(pfn); + swp_pte = swp_entry_to_pte(entry); + + /* + * pteval maps a zone device page and is therefore + * a swap pte. 
+ */ + if (pte_swp_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_swp_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + trace_set_migration_pte(pvmw.address, pte_val(swp_pte), + compound_order(&folio->page)); + /* + * No need to invalidate here: it will synchronize + * against the special swap migration pte. + */ + } else if (PageHWPoison(subpage)) { + pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); + if (folio_test_hugetlb(folio)) { + hugetlb_count_sub(folio_nr_pages(folio), mm); + set_huge_pte_at(mm, address, pvmw.pte, pteval); + } else { + dec_mm_counter(mm, mm_counter(&folio->page)); + set_pte_at(mm, address, pvmw.pte, pteval); + } + + } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { + /* + * The guest indicated that the page content is of no + * interest anymore. Simply discard the pte, vmscan + * will take care of the rest. + * A future reference will then fault in a new zero + * page. When userfaultfd is active, we must not drop + * this page though, as its main user (postcopy + * migration) will not expect userfaults on already + * copied pages. + */ + dec_mm_counter(mm, mm_counter(&folio->page)); + /* We have to invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); + } else { + swp_entry_t entry; + pte_t swp_pte; + + if (arch_unmap_one(mm, vma, address, pteval) < 0) { + if (folio_test_hugetlb(folio)) + set_huge_pte_at(mm, address, pvmw.pte, pteval); + else + set_pte_at(mm, address, pvmw.pte, pteval); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) && + !anon_exclusive, subpage); + + /* See page_try_share_anon_rmap(): clear PTE first. 
*/ + if (anon_exclusive && + page_try_share_anon_rmap(subpage)) { + if (folio_test_hugetlb(folio)) + set_huge_pte_at(mm, address, pvmw.pte, pteval); + else + set_pte_at(mm, address, pvmw.pte, pteval); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + if (pte_write(pteval)) + entry = make_writable_migration_entry( + page_to_pfn(subpage)); + else if (anon_exclusive) + entry = make_readable_exclusive_migration_entry( + page_to_pfn(subpage)); + else + entry = make_readable_migration_entry( + page_to_pfn(subpage)); + if (pte_young(pteval)) + entry = make_migration_entry_young(entry); + if (pte_dirty(pteval)) + entry = make_migration_entry_dirty(entry); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + if (folio_test_hugetlb(folio)) + set_huge_pte_at(mm, address, pvmw.pte, swp_pte); + else + set_pte_at(mm, address, pvmw.pte, swp_pte); + trace_set_migration_pte(address, pte_val(swp_pte), + compound_order(&folio->page)); + /* + * No need to invalidate here: it will synchronize + * against the special swap migration pte. 
+ */ + } + + /* + * No need to call mmu_notifier_invalidate_range() it has been + * done above for all cases requiring it to happen under page + * table lock before mmu_notifier_invalidate_range_end() + * + * See Documentation/mm/mmu_notifier.rst + */ + page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); + if (vma->vm_flags & VM_LOCKED) + mlock_page_drain_local(); + folio_put(folio); + } + + mmu_notifier_invalidate_range_end(&range); + + return ret; +} + +/** + * try_to_migrate - try to replace all page table mappings with swap entries + * @folio: the folio to replace page table entries for + * @flags: action and flags + * + * Tries to remove all the page table entries which are mapping this folio and + * replace them with special swap entries. Caller must hold the folio lock. + */ +void try_to_migrate(struct folio *folio, enum ttu_flags flags) +{ + struct rmap_walk_control rwc = { + .rmap_one = try_to_migrate_one, + .arg = (void *)flags, + .done = page_not_mapped, + .anon_lock = folio_lock_anon_vma_read, + }; + + /* + * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and + * TTU_SPLIT_HUGE_PMD and TTU_SYNC flags. + */ + if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | + TTU_SYNC))) + return; + /* Of zone device folios, only device private/coherent ones are handled. */ + if (folio_is_zone_device(folio) && + (!folio_is_device_private(folio) && !folio_is_device_coherent(folio))) + return; + + /* + * During exec, a temporary VMA is setup and later moved. + * The VMA is moved under the anon_vma lock but not the + * page tables leading to a race where migration cannot + * find the migration ptes. Rather than increasing the + * locking requirements of exec(), migration skips + * temporary VMAs until after exec() completes. 
+ */ + if (!folio_test_ksm(folio) && folio_test_anon(folio)) + rwc.invalid_vma = invalid_migration_vma; + + if (flags & TTU_RMAP_LOCKED) + rmap_walk_locked(folio, &rwc); + else + rmap_walk(folio, &rwc); +} + +#ifdef CONFIG_DEVICE_PRIVATE +struct make_exclusive_args { + struct mm_struct *mm; + unsigned long address; + void *owner; + bool valid; +}; + +static bool page_make_device_exclusive_one(struct folio *folio, + struct vm_area_struct *vma, unsigned long address, void *priv) +{ + struct mm_struct *mm = vma->vm_mm; + DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); + struct make_exclusive_args *args = priv; + pte_t pteval; + struct page *subpage; + bool ret = true; + struct mmu_notifier_range range; + swp_entry_t entry; + pte_t swp_pte; + + mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, + vma->vm_mm, address, min(vma->vm_end, + address + folio_size(folio)), + args->owner); + mmu_notifier_invalidate_range_start(&range); + + while (page_vma_mapped_walk(&pvmw)) { + /* Unexpected PMD-mapped THP? */ + VM_BUG_ON_FOLIO(!pvmw.pte, folio); + + if (!pte_present(*pvmw.pte)) { + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + + subpage = folio_page(folio, + pte_pfn(*pvmw.pte) - folio_pfn(folio)); + address = pvmw.address; + + /* Nuke the page table entry. */ + flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); + pteval = ptep_clear_flush(vma, address, pvmw.pte); + + /* Set the dirty flag on the folio now the pte is gone. */ + if (pte_dirty(pteval)) + folio_mark_dirty(folio); + + /* + * Check that our target page is still mapped at the expected + * address. + */ + if (args->mm == mm && args->address == address && + pte_write(pteval)) + args->valid = true; + + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. 
+ */ + if (pte_write(pteval)) + entry = make_writable_device_exclusive_entry( + page_to_pfn(subpage)); + else + entry = make_readable_device_exclusive_entry( + page_to_pfn(subpage)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + + set_pte_at(mm, address, pvmw.pte, swp_pte); + + /* + * There is a reference on the page for the swap entry which has + * been removed, so shouldn't take another. + */ + page_remove_rmap(subpage, vma, false); + } + + mmu_notifier_invalidate_range_end(&range); + + return ret; +} + +/** + * folio_make_device_exclusive - Mark the folio exclusively owned by a device. + * @folio: The folio to replace page table entries for. + * @mm: The mm_struct where the folio is expected to be mapped. + * @address: Address where the folio is expected to be mapped. + * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks + * + * Tries to remove all the page table entries which are mapping this + * folio and replace them with special device exclusive swap entries to + * grant a device exclusive access to the folio. + * + * Context: Caller must hold the folio lock. + * Return: false if the page is still mapped, or if it could not be unmapped + * from the expected address. Otherwise returns true (success). + */ +static bool folio_make_device_exclusive(struct folio *folio, + struct mm_struct *mm, unsigned long address, void *owner) +{ + struct make_exclusive_args args = { + .mm = mm, + .address = address, + .owner = owner, + .valid = false, + }; + struct rmap_walk_control rwc = { + .rmap_one = page_make_device_exclusive_one, + .done = page_not_mapped, + .anon_lock = folio_lock_anon_vma_read, + .arg = &args, + }; + + /* + * Restrict to anonymous folios for now to avoid potential writeback + * issues. 
+ */ + if (!folio_test_anon(folio)) + return false; + + rmap_walk(folio, &rwc); + + return args.valid && !folio_mapcount(folio); +} + +/** + * make_device_exclusive_range() - Mark a range for exclusive use by a device + * @mm: mm_struct of associated target process + * @start: start of the region to mark for exclusive device access + * @end: end address of region + * @pages: returns the pages which were successfully marked for exclusive access + * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering + * + * Returns: number of pages found in the range by GUP. A page is marked for + * exclusive access only if the page pointer is non-NULL. + * + * This function finds ptes mapping page(s) to the given address range, locks + * them and replaces mappings with special swap entries preventing userspace CPU + * access. On fault these entries are replaced with the original mapping after + * calling MMU notifiers. + * + * A driver using this to program access from a device must use a mmu notifier + * critical section to hold a device specific lock during programming. Once + * programming is complete it should drop the page lock and reference after + * which point CPU access to the page will revoke the exclusive access. 
+ */ +int make_device_exclusive_range(struct mm_struct *mm, unsigned long start, + unsigned long end, struct page **pages, + void *owner) +{ + long npages = (end - start) >> PAGE_SHIFT; + long i; + + npages = get_user_pages_remote(mm, start, npages, + FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, + pages, NULL, NULL); + if (npages < 0) + return npages; + + for (i = 0; i < npages; i++, start += PAGE_SIZE) { + struct folio *folio = page_folio(pages[i]); + if (PageTail(pages[i]) || !folio_trylock(folio)) { + folio_put(folio); + pages[i] = NULL; + continue; + } + + if (!folio_make_device_exclusive(folio, mm, start, owner)) { + folio_unlock(folio); + folio_put(folio); + pages[i] = NULL; + } + } + + return npages; +} +EXPORT_SYMBOL_GPL(make_device_exclusive_range); +#endif + +void __put_anon_vma(struct anon_vma *anon_vma) +{ + struct anon_vma *root = anon_vma->root; + + anon_vma_free(anon_vma); + if (root != anon_vma && atomic_dec_and_test(&root->refcount)) + anon_vma_free(root); +} + +static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, + struct rmap_walk_control *rwc) +{ + struct anon_vma *anon_vma; + + if (rwc->anon_lock) + return rwc->anon_lock(folio, rwc); + + /* + * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() + * because that depends on page_mapped(); but not all its usages + * are holding mmap_lock. 
Users without mmap_lock are required to + * take a reference count to prevent the anon_vma disappearing + */ + anon_vma = folio_anon_vma(folio); + if (!anon_vma) + return NULL; + + if (anon_vma_trylock_read(anon_vma)) + goto out; + + if (rwc->try_lock) { + anon_vma = NULL; + rwc->contended = true; + goto out; + } + + anon_vma_lock_read(anon_vma); +out: + return anon_vma; +} + +/* + * rmap_walk_anon - do something to anonymous page using the object-based + * rmap method + * @folio: the folio to be handled + * @rwc: control variable according to each walk type + * @locked: true if the caller already holds the anon_vma lock, in which case + * it is neither taken nor released here + * + * Find all the mappings of a folio using the mapping pointer and the vma chains + * contained in the anon_vma struct it points to. The walk stops early when + * rwc->rmap_one() returns false or rwc->done() returns true. + */ +static void rmap_walk_anon(struct folio *folio, + struct rmap_walk_control *rwc, bool locked) +{ + struct anon_vma *anon_vma; + pgoff_t pgoff_start, pgoff_end; + struct anon_vma_chain *avc; + + if (locked) { + anon_vma = folio_anon_vma(folio); + /* anon_vma disappear under us? */ + VM_BUG_ON_FOLIO(!anon_vma, folio); + } else { + anon_vma = rmap_walk_anon_lock(folio, rwc); + } + if (!anon_vma) + return; + + pgoff_start = folio_pgoff(folio); + pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, + pgoff_start, pgoff_end) { + struct vm_area_struct *vma = avc->vma; + unsigned long address = vma_address(&folio->page, vma); + + VM_BUG_ON_VMA(address == -EFAULT, vma); + cond_resched(); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + + if (!rwc->rmap_one(folio, vma, address, rwc->arg)) + break; + if (rwc->done && rwc->done(folio)) + break; + } + + if (!locked) + anon_vma_unlock_read(anon_vma); +} + +/* + * rmap_walk_file - do something to file page using the object-based rmap method + * @folio: the folio to be handled + * @rwc: control variable according to each walk type + * @locked: true if the caller already holds the i_mmap lock of the mapping, + * in which case it is neither taken nor released here + * + * Find all the mappings of a folio using the mapping pointer and the vma chains + * contained in the address_space struct it
points to. The walk stops early when rwc->rmap_one() returns false or + * rwc->done() returns true. + */ +static void rmap_walk_file(struct folio *folio, + struct rmap_walk_control *rwc, bool locked) +{ + struct address_space *mapping = folio_mapping(folio); + pgoff_t pgoff_start, pgoff_end; + struct vm_area_struct *vma; + + /* + * The page lock not only makes sure that page->mapping cannot + * suddenly be NULLified by truncation, it makes sure that the + * structure at mapping cannot be freed and reused yet, + * so we can safely take mapping->i_mmap_rwsem. + */ + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (!mapping) + return; + + pgoff_start = folio_pgoff(folio); + pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; + if (!locked) { + if (i_mmap_trylock_read(mapping)) + goto lookup; + + if (rwc->try_lock) { + rwc->contended = true; + return; + } + + i_mmap_lock_read(mapping); + } +lookup: + vma_interval_tree_foreach(vma, &mapping->i_mmap, + pgoff_start, pgoff_end) { + unsigned long address = vma_address(&folio->page, vma); + + VM_BUG_ON_VMA(address == -EFAULT, vma); + cond_resched(); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) + continue; + + if (!rwc->rmap_one(folio, vma, address, rwc->arg)) + goto done; + if (rwc->done && rwc->done(folio)) + goto done; + } + +done: + if (!locked) + i_mmap_unlock_read(mapping); +} + +/* + * rmap_walk - walk the mappings of a folio + * @folio: the folio to be handled + * @rwc: control variable according to each walk type + * + * Dispatches to rmap_walk_ksm() for KSM folios, to rmap_walk_anon() for other + * anonymous folios and to rmap_walk_file() otherwise. The anon and file + * walkers are called with locked == false. + */ +void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) +{ + if (unlikely(folio_test_ksm(folio))) + rmap_walk_ksm(folio, rwc); + else if (folio_test_anon(folio)) + rmap_walk_anon(folio, rwc, false); + else + rmap_walk_file(folio, rwc, false); +} + +/* Like rmap_walk, but caller holds relevant rmap lock */ +void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) +{ + /* no ksm support for now */ + VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); + if (folio_test_anon(folio)) + rmap_walk_anon(folio, rwc, true); + else + rmap_walk_file(folio, rwc, true); +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * The following two functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code + * and no lru code, because we handle hugepages differently from common pages. + * + * RMAP_COMPOUND is ignored. + */ +void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long address, rmap_t flags) +{ + struct anon_vma *anon_vma = vma->anon_vma; + int first; + + BUG_ON(!PageLocked(page)); + BUG_ON(!anon_vma); + /* address might be in next vma when migration races vma_adjust */ + first = atomic_inc_and_test(compound_mapcount_ptr(page)); + VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); + VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); + if (first) + __page_set_anon_rmap(page, vma, address, + !!(flags & RMAP_EXCLUSIVE)); +} + +void hugepage_add_new_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + BUG_ON(address < vma->vm_start || address >= vma->vm_end); + atomic_set(compound_mapcount_ptr(page), 0); + atomic_set(compound_pincount_ptr(page), 0); + + __page_set_anon_rmap(page, vma, address, 1); +} +#endif /* CONFIG_HUGETLB_PAGE */ diff --git a/mm/rodata_test.c b/mm/rodata_test.c new file mode 100644 index 000000000..6d7834369 --- /dev/null +++ b/mm/rodata_test.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * rodata_test.c: functional test for mark_rodata_ro function + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven + */ +#define pr_fmt(fmt) "rodata_test: " fmt + +#include +#include +#include +#include + +static const int rodata_test_data = 0xC3; + +void rodata_test(void) +{ + int zero = 0; + + /* test 1: read the value */ + /* If this test fails, some previous testrun has clobbered the state */ + if (!rodata_test_data) { + pr_err("test 1 fails (start data)\n"); + return; + } + + /* test 2: write to the variable; this should fault */ + if (!copy_to_kernel_nofault((void *)&rodata_test_data, + (void *)&zero, sizeof(zero))) { + pr_err("test data was not read 
only\n"); + return; + } + + /* test 3: check the value hasn't changed */ + if (rodata_test_data == zero) { + pr_err("test data was changed\n"); + return; + } + + /* test 4: check if the rodata section is PAGE_SIZE aligned */ + if (!PAGE_ALIGNED(__start_rodata)) { + pr_err("start of .rodata is not page size aligned\n"); + return; + } + if (!PAGE_ALIGNED(__end_rodata)) { + pr_err("end of .rodata is not page size aligned\n"); + return; + } + + pr_info("all tests were successful\n"); +} diff --git a/mm/secretmem.c b/mm/secretmem.c new file mode 100644 index 000000000..04c3ac944 --- /dev/null +++ b/mm/secretmem.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2021 + * + * Author: Mike Rapoport + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "internal.h" + +#undef pr_fmt +#define pr_fmt(fmt) "secretmem: " fmt + +/* + * Define mode and flag masks to allow validation of the system call + * parameters. 
+ */ +#define SECRETMEM_MODE_MASK (0x0) +#define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK + +static bool secretmem_enable __ro_after_init; +module_param_named(enable, secretmem_enable, bool, 0400); +/* Describe the parameter under the name it is exposed as: enable */ +MODULE_PARM_DESC(enable, + "Enable secretmem and memfd_secret(2) system call"); + +static atomic_t secretmem_users; + +/* Report whether any secretmem file is currently accounted in secretmem_users */ +bool secretmem_active(void) +{ + return !!atomic_read(&secretmem_users); +} + +/* + * secretmem_fault - fault handler installed in secretmem_vm_ops + * + * Fails with -EINVAL for offsets at or beyond i_size. Otherwise looks up the + * page at vmf->pgoff in the page cache; when it is absent, a zeroed page is + * allocated, marked invalid in the direct map, added to the page cache and + * its kernel address range is flushed from the TLB. On success the locked + * page is stored in vmf->page and VM_FAULT_LOCKED is returned. + */ +static vm_fault_t secretmem_fault(struct vm_fault *vmf) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + struct inode *inode = file_inode(vmf->vma->vm_file); + pgoff_t offset = vmf->pgoff; + gfp_t gfp = vmf->gfp_mask; + unsigned long addr; + struct page *page; + vm_fault_t ret; + int err; + + if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) + return vmf_error(-EINVAL); + + filemap_invalidate_lock_shared(mapping); + +retry: + page = find_lock_page(mapping, offset); + if (!page) { + page = alloc_page(gfp | __GFP_ZERO); + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + + err = set_direct_map_invalid_noflush(page); + if (err) { + put_page(page); + ret = vmf_error(err); + goto out; + } + + __SetPageUptodate(page); + err = add_to_page_cache_lru(page, mapping, offset, gfp); + if (unlikely(err)) { + /* + * Restore the direct map entry while our reference + * is still held: once put_page() drops it the page + * may be freed and handed to another user. + * + * If a split of large page was required, it + * already happened when we marked the page invalid + * which guarantees that this call won't fail + */ + set_direct_map_default_noflush(page); + put_page(page); + if (err == -EEXIST) + goto retry; + + ret = vmf_error(err); + goto out; + } + + addr = (unsigned long)page_address(page); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + } + + vmf->page = page; + ret = VM_FAULT_LOCKED; + +out: + filemap_invalidate_unlock_shared(mapping); + return ret; +} + +static const struct vm_operations_struct secretmem_vm_ops = { + .fault = secretmem_fault, +}; + +/* Drop the secretmem_users count taken when the file was created */ +static int secretmem_release(struct inode *inode, struct file *file) +{ + atomic_dec(&secretmem_users); + return 0; +} + +static int
secretmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + unsigned long len = vma->vm_end - vma->vm_start; + + if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) + return -EINVAL; + + if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) + return -EAGAIN; + + vma->vm_flags |= VM_LOCKED | VM_DONTDUMP; + vma->vm_ops = &secretmem_vm_ops; + + return 0; +} + +bool vma_is_secretmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &secretmem_vm_ops; +} + +static const struct file_operations secretmem_fops = { + .release = secretmem_release, + .mmap = secretmem_mmap, +}; + +static int secretmem_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) +{ + return -EBUSY; +} + +static void secretmem_free_folio(struct folio *folio) +{ + set_direct_map_default_noflush(&folio->page); + folio_zero_segment(folio, 0, folio_size(folio)); +} + +const struct address_space_operations secretmem_aops = { + .dirty_folio = noop_dirty_folio, + .free_folio = secretmem_free_folio, + .migrate_folio = secretmem_migrate_folio, +}; + +static int secretmem_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + struct address_space *mapping = inode->i_mapping; + unsigned int ia_valid = iattr->ia_valid; + int ret; + + filemap_invalidate_lock(mapping); + + if ((ia_valid & ATTR_SIZE) && inode->i_size) + ret = -EINVAL; + else + ret = simple_setattr(mnt_userns, dentry, iattr); + + filemap_invalidate_unlock(mapping); + + return ret; +} + +static const struct inode_operations secretmem_iops = { + .setattr = secretmem_setattr, +}; + +static struct vfsmount *secretmem_mnt; + +static struct file *secretmem_file_create(unsigned long flags) +{ + struct file *file = ERR_PTR(-ENOMEM); + struct inode *inode; + const char *anon_name = "[secretmem]"; + const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); + int err; + + inode = 
alloc_anon_inode(secretmem_mnt->mnt_sb); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + err = security_inode_init_security_anon(inode, &qname, NULL); + if (err) { + file = ERR_PTR(err); + goto err_free_inode; + } + + file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", + O_RDWR, &secretmem_fops); + if (IS_ERR(file)) + goto err_free_inode; + + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); + + inode->i_op = &secretmem_iops; + inode->i_mapping->a_ops = &secretmem_aops; + + /* pretend we are a normal file with zero size */ + inode->i_mode |= S_IFREG; + inode->i_size = 0; + + return file; + +err_free_inode: + iput(inode); + return file; +} + +/* + * memfd_secret - create a secretmem file and install a descriptor for it + * @flags: O_CLOEXEC and/or bits from SECRETMEM_FLAGS_MASK + * + * Returns the new file descriptor on success. Returns -ENOSYS when + * secretmem_enable is false, -EINVAL when @flags has any other bit set, + * -ENFILE when the secretmem_users counter reads negative, or the error + * reported by get_unused_fd_flags() or secretmem_file_create(). + */ +SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) +{ + struct file *file; + int fd, err; + + /* make sure local flags do not conflict with global fcntl.h */ + BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); + + if (!secretmem_enable) + return -ENOSYS; + + if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) + return -EINVAL; + if (atomic_read(&secretmem_users) < 0) + return -ENFILE; + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + file = secretmem_file_create(flags); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto err_put_fd; + } + + file->f_flags |= O_LARGEFILE; + + /* count the file only once it is about to become visible via fd */ + atomic_inc(&secretmem_users); + fd_install(fd, file); + return fd; + +err_put_fd: + put_unused_fd(fd); + return err; +} + +static int secretmem_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, SECRETMEM_MAGIC) ?
0 : -ENOMEM; +} + +static struct file_system_type secretmem_fs = { + .name = "secretmem", + .init_fs_context = secretmem_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int __init secretmem_init(void) +{ + if (!secretmem_enable) + return 0; + + secretmem_mnt = kern_mount(&secretmem_fs); + if (IS_ERR(secretmem_mnt)) + return PTR_ERR(secretmem_mnt); + + /* prevent secretmem mappings from ever getting PROT_EXEC */ + secretmem_mnt->mnt_flags |= MNT_NOEXEC; + + return 0; +} +fs_initcall(secretmem_init); diff --git a/mm/shmem.c b/mm/shmem.c new file mode 100644 index 000000000..f7c08e169 --- /dev/null +++ b/mm/shmem.c @@ -0,0 +1,4376 @@ +/* + * Resizable virtual memory filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + * 2000-2001 Christoph Rohland + * 2000-2001 SAP AG + * 2002 Red Hat Inc. + * Copyright (C) 2002-2011 Hugh Dickins. + * Copyright (C) 2011 Google Inc. + * Copyright (C) 2002-2005 VERITAS Software Corporation. + * Copyright (C) 2004 Andi Kleen, SuSE Labs + * + * Extended attribute support for tmpfs: + * Copyright (c) 2004, Luke Kenneth Casson Leighton + * Copyright (c) 2004 Red Hat, Inc., James Morris + * + * tiny-shmem: + * Copyright (c) 2004, 2008 Matt Mackall + * + * This file is released under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "swap.h" + +static struct vfsmount *shm_mnt; + +#ifdef CONFIG_SHMEM +/* + * This virtual memory filesystem is heavily based on the ramfs. It + * extends ramfs by the ability to use swap and honor resource limits + * which makes it a completely usable filesystem. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" + +#define BLOCKS_PER_PAGE (PAGE_SIZE/512) +#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) + +/* Pretend that each entry is of this size in directory's i_size */ +#define BOGO_DIRENT_SIZE 20 + +/* Symlink up to this size is kmalloc'ed instead of using a swappable page */ +#define SHORT_SYMLINK_LEN 128 + +/* + * shmem_fallocate communicates with shmem_fault or shmem_writepage via + * inode->i_private (with i_rwsem making sure that it has only one user at + * a time): we would prefer not to enlarge the shmem inode just for that. + */ +struct shmem_falloc { + wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ + pgoff_t start; /* start of range currently being fallocated */ + pgoff_t next; /* the next page offset to be fallocated */ + pgoff_t nr_falloced; /* how many new pages have been fallocated */ + pgoff_t nr_unswapped; /* how often writepage refused to swap out */ +}; + +struct shmem_options { + unsigned long long blocks; + unsigned long long inodes; + struct mempolicy *mpol; + kuid_t uid; + kgid_t gid; + umode_t mode; + bool full_inums; + int huge; + int seen; +#define SHMEM_SEEN_BLOCKS 1 +#define SHMEM_SEEN_INODES 2 +#define SHMEM_SEEN_HUGE 4 +#define SHMEM_SEEN_INUMS 8 +}; + +#ifdef CONFIG_TMPFS +static unsigned long shmem_default_max_blocks(void) +{ + return totalram_pages() / 2; +} + +static unsigned long shmem_default_max_inodes(void) +{ + unsigned long nr_pages = totalram_pages(); + + return min(nr_pages - totalhigh_pages(), nr_pages / 2); +} +#endif + +static int shmem_swapin_folio(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, + gfp_t gfp, struct vm_area_struct *vma, 
+ vm_fault_t *fault_type); + +static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* + * shmem_file_setup pre-accounts the whole fixed size of a VM object, + * for shared memory and for shared anonymous (/dev/zero) mappings + * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1), + * consistent with the pre-accounting of private mappings ... + */ +static inline int shmem_acct_size(unsigned long flags, loff_t size) +{ + return (flags & VM_NORESERVE) ? + 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); +} + +static inline void shmem_unacct_size(unsigned long flags, loff_t size) +{ + if (!(flags & VM_NORESERVE)) + vm_unacct_memory(VM_ACCT(size)); +} + +static inline int shmem_reacct_size(unsigned long flags, + loff_t oldsize, loff_t newsize) +{ + if (!(flags & VM_NORESERVE)) { + if (VM_ACCT(newsize) > VM_ACCT(oldsize)) + return security_vm_enough_memory_mm(current->mm, + VM_ACCT(newsize) - VM_ACCT(oldsize)); + else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) + vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); + } + return 0; +} + +/* + * ... whereas tmpfs objects are accounted incrementally as + * pages are allocated, in order to allow large sparse files. + * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 
+ */ +static inline int shmem_acct_block(unsigned long flags, long pages) +{ + if (!(flags & VM_NORESERVE)) + return 0; + + return security_vm_enough_memory_mm(current->mm, + pages * VM_ACCT(PAGE_SIZE)); +} + +static inline void shmem_unacct_blocks(unsigned long flags, long pages) +{ + if (flags & VM_NORESERVE) + vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); +} + +static inline bool shmem_inode_acct_block(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + + if (shmem_acct_block(info->flags, pages)) + return false; + + if (sbinfo->max_blocks) { + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks - pages) > 0) + goto unacct; + percpu_counter_add(&sbinfo->used_blocks, pages); + } + + return true; + +unacct: + shmem_unacct_blocks(info->flags, pages); + return false; +} + +static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + + if (sbinfo->max_blocks) + percpu_counter_sub(&sbinfo->used_blocks, pages); + shmem_unacct_blocks(info->flags, pages); +} + +static const struct super_operations shmem_ops; +const struct address_space_operations shmem_aops; +static const struct file_operations shmem_file_operations; +static const struct inode_operations shmem_inode_operations; +static const struct inode_operations shmem_dir_inode_operations; +static const struct inode_operations shmem_special_inode_operations; +static const struct vm_operations_struct shmem_vm_ops; +static struct file_system_type shmem_fs_type; + +bool vma_is_shmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &shmem_vm_ops; +} + +static LIST_HEAD(shmem_swaplist); +static DEFINE_MUTEX(shmem_swaplist_mutex); + +/* + * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and + * produces a novel ino for the newly allocated inode. 
+ * + * It may also be called when making a hard link to permit the space needed by + * each dentry. However, in that case, no new inode number is needed since that + * internally draws from another pool of inode numbers (currently global + * get_next_ino()). This case is indicated by passing NULL as inop. + */ +#define SHMEM_INO_BATCH 1024 +static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + ino_t ino; + + if (!(sb->s_flags & SB_KERNMOUNT)) { + raw_spin_lock(&sbinfo->stat_lock); + if (sbinfo->max_inodes) { + if (!sbinfo->free_inodes) { + raw_spin_unlock(&sbinfo->stat_lock); + return -ENOSPC; + } + sbinfo->free_inodes--; + } + if (inop) { + ino = sbinfo->next_ino++; + if (unlikely(is_zero_ino(ino))) + ino = sbinfo->next_ino++; + if (unlikely(!sbinfo->full_inums && + ino > UINT_MAX)) { + /* + * Emulate get_next_ino uint wraparound for + * compatibility + */ + if (IS_ENABLED(CONFIG_64BIT)) + pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n", + __func__, MINOR(sb->s_dev)); + sbinfo->next_ino = 1; + ino = sbinfo->next_ino++; + } + *inop = ino; + } + raw_spin_unlock(&sbinfo->stat_lock); + } else if (inop) { + /* + * __shmem_file_setup, one of our callers, is lock-free: it + * doesn't hold stat_lock in shmem_reserve_inode since + * max_inodes is always 0, and is called from potentially + * unknown contexts. As such, use a per-cpu batched allocator + * which doesn't require the per-sb stat_lock unless we are at + * the batch boundary. + * + * We don't need to worry about inode{32,64} since SB_KERNMOUNT + * shmem mounts are not exposed to userspace, so we don't need + * to worry about things like glibc compatibility. 
+ */ + ino_t *next_ino; + + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); + ino = *next_ino; + if (unlikely(ino % SHMEM_INO_BATCH == 0)) { + raw_spin_lock(&sbinfo->stat_lock); + ino = sbinfo->next_ino; + sbinfo->next_ino += SHMEM_INO_BATCH; + raw_spin_unlock(&sbinfo->stat_lock); + if (unlikely(is_zero_ino(ino))) + ino++; + } + *inop = ino; + *next_ino = ++ino; + put_cpu(); + } + + return 0; +} + +static void shmem_free_inode(struct super_block *sb) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + if (sbinfo->max_inodes) { + raw_spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + raw_spin_unlock(&sbinfo->stat_lock); + } +} + +/** + * shmem_recalc_inode - recalculate the block usage of an inode + * @inode: inode to recalc + * + * We have to calculate the free blocks since the mm can drop + * undirtied hole pages behind our back. + * + * But normally info->alloced == inode->i_mapping->nrpages + info->swapped + * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) + * + * It has to be called with the spinlock held. 
+ */ +static void shmem_recalc_inode(struct inode *inode) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + long freed; + + freed = info->alloced - info->swapped - inode->i_mapping->nrpages; + if (freed > 0) { + info->alloced -= freed; + inode->i_blocks -= freed * BLOCKS_PER_PAGE; + shmem_inode_unacct_blocks(inode, freed); + } +} + +/* + * shmem_charge - account @pages additional pages to @inode + * + * Returns false without changing anything when shmem_inode_acct_block() + * refuses the pages. Otherwise adds @pages to the mapping nrpages, then, + * under info->lock, to info->alloced and (scaled by BLOCKS_PER_PAGE) to + * i_blocks, rebalances with shmem_recalc_inode() and returns true. + */ +bool shmem_charge(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long flags; + + if (!shmem_inode_acct_block(inode, pages)) + return false; + + /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ + inode->i_mapping->nrpages += pages; + + spin_lock_irqsave(&info->lock, flags); + info->alloced += pages; + inode->i_blocks += pages * BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock_irqrestore(&info->lock, flags); + + return true; +} + +/* + * shmem_uncharge - remove the accounting for @pages pages from @inode + * + * Under info->lock, subtracts @pages from info->alloced and (scaled by + * BLOCKS_PER_PAGE) from i_blocks and rebalances with shmem_recalc_inode(); + * the block accounting is then released with shmem_inode_unacct_blocks(). + * The mapping nrpages is not touched here. + */ +void shmem_uncharge(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long flags; + + /* nrpages adjustment done by __filemap_remove_folio() or caller */ + + spin_lock_irqsave(&info->lock, flags); + info->alloced -= pages; + inode->i_blocks -= pages * BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock_irqrestore(&info->lock, flags); + + shmem_inode_unacct_blocks(inode, pages); +} + +/* + * Replace item expected in xarray by a new item, while holding xa_lock. + * + * Returns -ENOENT and stores nothing when the slot at @index does not hold + * @expected; returns 0 after storing @replacement. Both pointers must be + * non-NULL. + */ +static int shmem_replace_entry(struct address_space *mapping, + pgoff_t index, void *expected, void *replacement) +{ + XA_STATE(xas, &mapping->i_pages, index); + void *item; + + VM_BUG_ON(!expected); + VM_BUG_ON(!replacement); + item = xas_load(&xas); + if (item != expected) + return -ENOENT; + xas_store(&xas, replacement); + return 0; +} + +/* + * Sometimes, before we decide whether to proceed or to fail, we must check + * that an entry was not already brought back from swap by a racing thread.
+ * + * Checking page is not enough: by the time a SwapCache page is locked, it + * might be reused, and again be SwapCache, using the same swap as before. + */ +static bool shmem_confirm_swap(struct address_space *mapping, + pgoff_t index, swp_entry_t swap) +{ + return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); +} + +/* + * Definitions for "huge tmpfs": tmpfs mounted with the huge= option + * + * SHMEM_HUGE_NEVER: + * disables huge pages for the mount; + * SHMEM_HUGE_ALWAYS: + * enables huge pages for the mount; + * SHMEM_HUGE_WITHIN_SIZE: + * only allocate huge pages if the page will be fully within i_size, + * also respect fadvise()/madvise() hints; + * SHMEM_HUGE_ADVISE: + * only allocate huge pages if requested with fadvise()/madvise(); + */ + +#define SHMEM_HUGE_NEVER 0 +#define SHMEM_HUGE_ALWAYS 1 +#define SHMEM_HUGE_WITHIN_SIZE 2 +#define SHMEM_HUGE_ADVISE 3 + +/* + * Special values. + * Only can be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: + * + * SHMEM_HUGE_DENY: + * disables huge on shm_mnt and all mounts, for emergency use; + * SHMEM_HUGE_FORCE: + * enables huge on shm_mnt and all mounts, w/o needing option, for testing; + * + */ +#define SHMEM_HUGE_DENY (-1) +#define SHMEM_HUGE_FORCE (-2) + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* ifdef here to avoid bloating shmem.o when not necessary */ + +static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; + +bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force) +{ + loff_t i_size; + + if (!S_ISREG(inode->i_mode)) + return false; + if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) + return false; + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE) + return true; + + switch (SHMEM_SB(inode->i_sb)->huge) { + case SHMEM_HUGE_ALWAYS: + return true; + case SHMEM_HUGE_WITHIN_SIZE: + index = round_up(index + 1, HPAGE_PMD_NR); 
+ i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >> PAGE_SHIFT >= index) + return true; + fallthrough; + case SHMEM_HUGE_ADVISE: + if (vma && (vma->vm_flags & VM_HUGEPAGE)) + return true; + fallthrough; + default: + return false; + } +} + +#if defined(CONFIG_SYSFS) +static int shmem_parse_huge(const char *str) +{ + if (!strcmp(str, "never")) + return SHMEM_HUGE_NEVER; + if (!strcmp(str, "always")) + return SHMEM_HUGE_ALWAYS; + if (!strcmp(str, "within_size")) + return SHMEM_HUGE_WITHIN_SIZE; + if (!strcmp(str, "advise")) + return SHMEM_HUGE_ADVISE; + if (!strcmp(str, "deny")) + return SHMEM_HUGE_DENY; + if (!strcmp(str, "force")) + return SHMEM_HUGE_FORCE; + return -EINVAL; +} +#endif + +#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) +static const char *shmem_format_huge(int huge) +{ + switch (huge) { + case SHMEM_HUGE_NEVER: + return "never"; + case SHMEM_HUGE_ALWAYS: + return "always"; + case SHMEM_HUGE_WITHIN_SIZE: + return "within_size"; + case SHMEM_HUGE_ADVISE: + return "advise"; + case SHMEM_HUGE_DENY: + return "deny"; + case SHMEM_HUGE_FORCE: + return "force"; + default: + VM_BUG_ON(1); + return "bad_val"; + } +} +#endif + +static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, + struct shrink_control *sc, unsigned long nr_to_split) +{ + LIST_HEAD(list), *pos, *next; + LIST_HEAD(to_remove); + struct inode *inode; + struct shmem_inode_info *info; + struct folio *folio; + unsigned long batch = sc ? 
sc->nr_to_scan : 128; + int split = 0; + + if (list_empty(&sbinfo->shrinklist)) + return SHRINK_STOP; + + spin_lock(&sbinfo->shrinklist_lock); + list_for_each_safe(pos, next, &sbinfo->shrinklist) { + info = list_entry(pos, struct shmem_inode_info, shrinklist); + + /* pin the inode */ + inode = igrab(&info->vfs_inode); + + /* inode is about to be evicted */ + if (!inode) { + list_del_init(&info->shrinklist); + goto next; + } + + /* Check if there's anything to gain */ + if (round_up(inode->i_size, PAGE_SIZE) == + round_up(inode->i_size, HPAGE_PMD_SIZE)) { + list_move(&info->shrinklist, &to_remove); + goto next; + } + + list_move(&info->shrinklist, &list); +next: + sbinfo->shrinklist_len--; + if (!--batch) + break; + } + spin_unlock(&sbinfo->shrinklist_lock); + + list_for_each_safe(pos, next, &to_remove) { + info = list_entry(pos, struct shmem_inode_info, shrinklist); + inode = &info->vfs_inode; + list_del_init(&info->shrinklist); + iput(inode); + } + + list_for_each_safe(pos, next, &list) { + int ret; + pgoff_t index; + + info = list_entry(pos, struct shmem_inode_info, shrinklist); + inode = &info->vfs_inode; + + if (nr_to_split && split >= nr_to_split) + goto move_back; + + index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT; + folio = filemap_get_folio(inode->i_mapping, index); + if (!folio) + goto drop; + + /* No huge page at the end of the file: nothing to split */ + if (!folio_test_large(folio)) { + folio_put(folio); + goto drop; + } + + /* + * Move the inode on the list back to shrinklist if we failed + * to lock the page at this time. + * + * Waiting for the lock may lead to deadlock in the + * reclaim path. 
+ */ + if (!folio_trylock(folio)) { + folio_put(folio); + goto move_back; + } + + ret = split_folio(folio); + folio_unlock(folio); + folio_put(folio); + + /* If split failed move the inode on the list back to shrinklist */ + if (ret) + goto move_back; + + split++; +drop: + list_del_init(&info->shrinklist); + goto put; +move_back: + /* + * Make sure the inode is either on the global list or deleted + * from any local list before iput() since it could be deleted + * in another thread once we put the inode (then the local list + * is corrupted). + */ + spin_lock(&sbinfo->shrinklist_lock); + list_move(&info->shrinklist, &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + spin_unlock(&sbinfo->shrinklist_lock); +put: + iput(inode); + } + + return split; +} + +static long shmem_unused_huge_scan(struct super_block *sb, + struct shrink_control *sc) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + if (!READ_ONCE(sbinfo->shrinklist_len)) + return SHRINK_STOP; + + return shmem_unused_huge_shrink(sbinfo, sc, 0); +} + +static long shmem_unused_huge_count(struct super_block *sb, + struct shrink_control *sc) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + return READ_ONCE(sbinfo->shrinklist_len); +} +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ + +#define shmem_huge SHMEM_HUGE_DENY + +bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force) +{ + return false; +} + +static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, + struct shrink_control *sc, unsigned long nr_to_split) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +/* + * Like filemap_add_folio, but error if expected item has gone. 
+ */ +static int shmem_add_to_page_cache(struct folio *folio, + struct address_space *mapping, + pgoff_t index, void *expected, gfp_t gfp, + struct mm_struct *charge_mm) +{ + XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); + long nr = folio_nr_pages(folio); + int error; + + VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); + VM_BUG_ON(expected && folio_test_large(folio)); + + folio_ref_add(folio, nr); + folio->mapping = mapping; + folio->index = index; + + if (!folio_test_swapcache(folio)) { + error = mem_cgroup_charge(folio, charge_mm, gfp); + if (error) { + if (folio_test_pmd_mappable(folio)) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + goto error; + } + } + folio_throttle_swaprate(folio, gfp); + + do { + xas_lock_irq(&xas); + if (expected != xas_find_conflict(&xas)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + if (expected && xas_find_conflict(&xas)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + xas_store(&xas, folio); + if (xas_error(&xas)) + goto unlock; + if (folio_test_pmd_mappable(folio)) { + count_vm_event(THP_FILE_ALLOC); + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); + } + mapping->nrpages += nr; + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); + __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp)); + + if (xas_error(&xas)) { + error = xas_error(&xas); + goto error; + } + + return 0; +error: + folio->mapping = NULL; + folio_ref_sub(folio, nr); + return error; +} + +/* + * Like delete_from_page_cache, but substitutes swap for @folio. 
+ */ +static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) +{ + struct address_space *mapping = folio->mapping; + long nr = folio_nr_pages(folio); + int error; + + xa_lock_irq(&mapping->i_pages); + error = shmem_replace_entry(mapping, folio->index, folio, radswap); + folio->mapping = NULL; + mapping->nrpages -= nr; + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); + xa_unlock_irq(&mapping->i_pages); + folio_put(folio); + BUG_ON(error); +} + +/* + * Remove swap entry from page cache, free the swap and its page cache. + */ +static int shmem_free_swap(struct address_space *mapping, + pgoff_t index, void *radswap) +{ + void *old; + + old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); + if (old != radswap) + return -ENOENT; + free_swap_and_cache(radix_to_swp_entry(radswap)); + return 0; +} + +/* + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given offsets are swapped out. + * + * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. + */ +unsigned long shmem_partial_swap_usage(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; + unsigned long swapped = 0; + unsigned long max = end - 1; + + rcu_read_lock(); + xas_for_each(&xas, page, max) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) + swapped++; + if (xas.xa_index == max) + break; + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } + } + + rcu_read_unlock(); + + return swapped << PAGE_SHIFT; +} + +/* + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given vma is swapped out. + * + * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. 
+ */ +unsigned long shmem_swap_usage(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long swapped; + + /* Be careful as we don't hold info->lock */ + swapped = READ_ONCE(info->swapped); + + /* + * The easier cases are when the shmem object has nothing in swap, or + * the vma maps it whole. Then we can simply use the stats that we + * already track. + */ + if (!swapped) + return 0; + + if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) + return swapped << PAGE_SHIFT; + + /* Here comes the more involved part */ + return shmem_partial_swap_usage(mapping, vma->vm_pgoff, + vma->vm_pgoff + vma_pages(vma)); +} + +/* + * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. + */ +void shmem_unlock_mapping(struct address_space *mapping) +{ + struct folio_batch fbatch; + pgoff_t index = 0; + + folio_batch_init(&fbatch); + /* + * Minor point, but we might as well stop if someone else SHM_LOCKs it. + */ + while (!mapping_unevictable(mapping) && + filemap_get_folios(mapping, &index, ~0UL, &fbatch)) { + check_move_unevictable_folios(&fbatch); + folio_batch_release(&fbatch); + cond_resched(); + } +} + +static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) +{ + struct folio *folio; + + /* + * At first avoid shmem_get_folio(,,,SGP_READ): that fails + * beyond i_size, and reports fallocated pages as holes. + */ + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_ENTRY | FGP_LOCK, 0); + if (!xa_is_value(folio)) + return folio; + /* + * But read a page back from swap if any of it is within i_size + * (although in some cases this is just a waste of time). + */ + folio = NULL; + shmem_get_folio(inode, index, &folio, SGP_READ); + return folio; +} + +/* + * Remove range of pages and swap entries from page cache, and free them. 
+ * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. + */ +static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, + bool unfalloc) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgoff_t end = (lend + 1) >> PAGE_SHIFT; + struct folio_batch fbatch; + pgoff_t indices[PAGEVEC_SIZE]; + struct folio *folio; + bool same_folio; + long nr_swaps_freed = 0; + pgoff_t index; + int i; + + if (lend == -1) + end = -1; /* unsigned, so actually very big */ + + if (info->fallocend > start && info->fallocend <= end && !unfalloc) + info->fallocend = start; + + folio_batch_init(&fbatch); + index = start; + while (index < end && find_lock_entries(mapping, index, end - 1, + &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; + + index = indices[i]; + + if (xa_is_value(folio)) { + if (unfalloc) + continue; + nr_swaps_freed += !shmem_free_swap(mapping, + index, folio); + continue; + } + index += folio_nr_pages(folio) - 1; + + if (!unfalloc || !folio_test_uptodate(folio)) + truncate_inode_folio(mapping, folio); + folio_unlock(folio); + } + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); + cond_resched(); + index++; + } + + /* + * When undoing a failed fallocate, we want none of the partial folio + * zeroing and splitting below, but shall want to truncate the whole + * folio when !uptodate indicates that it was added by this fallocate, + * even when [lstart, lend] covers only a part of the folio. 
+ */ + if (unfalloc) + goto whole_folios; + + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); + folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); + if (folio) { + same_folio = lend < folio_pos(folio) + folio_size(folio); + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) { + start = folio->index + folio_nr_pages(folio); + if (same_folio) + end = folio->index; + } + folio_unlock(folio); + folio_put(folio); + folio = NULL; + } + + if (!same_folio) + folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); + if (folio) { + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); + } + +whole_folios: + + index = start; + while (index < end) { + cond_resched(); + + if (!find_get_entries(mapping, index, end - 1, &fbatch, + indices)) { + /* If all gone or hole-punch or unfalloc, we're done */ + if (index == start || end != -1) + break; + /* But if truncating, restart to make sure all gone */ + index = start; + continue; + } + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; + + index = indices[i]; + if (xa_is_value(folio)) { + if (unfalloc) + continue; + if (shmem_free_swap(mapping, index, folio)) { + /* Swap was replaced by page: retry */ + index--; + break; + } + nr_swaps_freed++; + continue; + } + + folio_lock(folio); + + if (!unfalloc || !folio_test_uptodate(folio)) { + if (folio_mapping(folio) != mapping) { + /* Page was replaced by swap: retry */ + folio_unlock(folio); + index--; + break; + } + VM_BUG_ON_FOLIO(folio_test_writeback(folio), + folio); + + if (!folio_test_large(folio)) { + truncate_inode_folio(mapping, folio); + } else if (truncate_inode_partial_folio(folio, lstart, lend)) { + /* + * If we split a page, reset the loop so + * that we pick up the new sub pages. 
+ * Otherwise the THP was entirely + * dropped or the target range was + * zeroed, so just continue the loop as + * is. + */ + if (!folio_test_large(folio)) { + folio_unlock(folio); + index = start; + break; + } + } + } + index = folio->index + folio_nr_pages(folio) - 1; + folio_unlock(folio); + } + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); + index++; + } + + spin_lock_irq(&info->lock); + info->swapped -= nr_swaps_freed; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); +} + +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + shmem_undo_range(inode, lstart, lend, false); + inode->i_ctime = inode->i_mtime = current_time(inode); + inode_inc_iversion(inode); +} +EXPORT_SYMBOL_GPL(shmem_truncate_range); + +static int shmem_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *inode = path->dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); + + if (info->alloced - info->swapped != inode->i_mapping->nrpages) { + spin_lock_irq(&info->lock); + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + } + if (info->fsflags & FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (info->fsflags & FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (info->fsflags & FS_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + generic_fillattr(&init_user_ns, inode, stat); + + if (shmem_is_huge(NULL, inode, 0, false)) + stat->blksize = HPAGE_PMD_SIZE; + + if (request_mask & STATX_BTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = info->i_crtime.tv_sec; + stat->btime.tv_nsec = info->i_crtime.tv_nsec; + } + + return 0; +} + +static int shmem_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = 
d_inode(dentry); + struct shmem_inode_info *info = SHMEM_I(inode); + int error; + bool update_mtime = false; + bool update_ctime = true; + + error = setattr_prepare(&init_user_ns, dentry, attr); + if (error) + return error; + + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { + loff_t oldsize = inode->i_size; + loff_t newsize = attr->ia_size; + + /* protected by i_rwsem */ + if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || + (newsize > oldsize && (info->seals & F_SEAL_GROW))) + return -EPERM; + + if (newsize != oldsize) { + error = shmem_reacct_size(SHMEM_I(inode)->flags, + oldsize, newsize); + if (error) + return error; + i_size_write(inode, newsize); + update_mtime = true; + } else { + update_ctime = false; + } + if (newsize <= oldsize) { + loff_t holebegin = round_up(newsize, PAGE_SIZE); + if (oldsize > holebegin) + unmap_mapping_range(inode->i_mapping, + holebegin, 0, 1); + if (info->alloced) + shmem_truncate_range(inode, + newsize, (loff_t)-1); + /* unmap again to remove racily COWed private pages */ + if (oldsize > holebegin) + unmap_mapping_range(inode->i_mapping, + holebegin, 0, 1); + } + } + + setattr_copy(&init_user_ns, inode, attr); + if (attr->ia_valid & ATTR_MODE) + error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + if (!error && update_ctime) { + inode->i_ctime = current_time(inode); + if (update_mtime) + inode->i_mtime = inode->i_ctime; + inode_inc_iversion(inode); + } + return error; +} + +static void shmem_evict_inode(struct inode *inode) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + + if (shmem_mapping(inode->i_mapping)) { + shmem_unacct_size(info->flags, inode->i_size); + inode->i_size = 0; + mapping_set_exiting(inode->i_mapping); + shmem_truncate_range(inode, 0, (loff_t)-1); + if (!list_empty(&info->shrinklist)) { + spin_lock(&sbinfo->shrinklist_lock); + if (!list_empty(&info->shrinklist)) { + list_del_init(&info->shrinklist); + 
sbinfo->shrinklist_len--; + } + spin_unlock(&sbinfo->shrinklist_lock); + } + while (!list_empty(&info->swaplist)) { + /* Wait while shmem_unuse() is scanning this inode... */ + wait_var_event(&info->stop_eviction, + !atomic_read(&info->stop_eviction)); + mutex_lock(&shmem_swaplist_mutex); + /* ...but beware of the race if we peeked too early */ + if (!atomic_read(&info->stop_eviction)) + list_del_init(&info->swaplist); + mutex_unlock(&shmem_swaplist_mutex); + } + } + + simple_xattrs_free(&info->xattrs); + WARN_ON(inode->i_blocks); + shmem_free_inode(inode->i_sb); + clear_inode(inode); +} + +static int shmem_find_swap_entries(struct address_space *mapping, + pgoff_t start, struct folio_batch *fbatch, + pgoff_t *indices, unsigned int type) +{ + XA_STATE(xas, &mapping->i_pages, start); + struct folio *folio; + swp_entry_t entry; + + rcu_read_lock(); + xas_for_each(&xas, folio, ULONG_MAX) { + if (xas_retry(&xas, folio)) + continue; + + if (!xa_is_value(folio)) + continue; + + entry = radix_to_swp_entry(folio); + /* + * swapin error entries can be found in the mapping. But they're + * deliberately ignored here as we've done everything we can do. + */ + if (swp_type(entry) != type) + continue; + + indices[folio_batch_count(fbatch)] = xas.xa_index; + if (!folio_batch_add(fbatch, folio)) + break; + + if (need_resched()) { + xas_pause(&xas); + cond_resched_rcu(); + } + } + rcu_read_unlock(); + + return xas.xa_index; +} + +/* + * Move the swapped pages for an inode to page cache. Returns the count + * of pages swapped in, or the error in case of failure. 
+ */ +static int shmem_unuse_swap_entries(struct inode *inode, + struct folio_batch *fbatch, pgoff_t *indices) +{ + int i = 0; + int ret = 0; + int error = 0; + struct address_space *mapping = inode->i_mapping; + + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; + + if (!xa_is_value(folio)) + continue; + error = shmem_swapin_folio(inode, indices[i], + &folio, SGP_CACHE, + mapping_gfp_mask(mapping), + NULL, NULL); + if (error == 0) { + folio_unlock(folio); + folio_put(folio); + ret++; + } + if (error == -ENOMEM) + break; + error = 0; + } + return error ? error : ret; +} + +/* + * If swap found in inode, free it and move page from swapcache to filecache. + */ +static int shmem_unuse_inode(struct inode *inode, unsigned int type) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t start = 0; + struct folio_batch fbatch; + pgoff_t indices[PAGEVEC_SIZE]; + int ret = 0; + + do { + folio_batch_init(&fbatch); + shmem_find_swap_entries(mapping, start, &fbatch, indices, type); + if (folio_batch_count(&fbatch) == 0) { + ret = 0; + break; + } + + ret = shmem_unuse_swap_entries(inode, &fbatch, indices); + if (ret < 0) + break; + + start = indices[folio_batch_count(&fbatch) - 1]; + } while (true); + + return ret; +} + +/* + * Read all the shared memory data that resides in the swap + * device 'type' back into memory, so the swap device can be + * unused. 
+ */ +int shmem_unuse(unsigned int type) +{ + struct shmem_inode_info *info, *next; + int error = 0; + + if (list_empty(&shmem_swaplist)) + return 0; + + mutex_lock(&shmem_swaplist_mutex); + list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { + if (!info->swapped) { + list_del_init(&info->swaplist); + continue; + } + /* + * Drop the swaplist mutex while searching the inode for swap; + * but before doing so, make sure shmem_evict_inode() will not + * remove placeholder inode from swaplist, nor let it be freed + * (igrab() would protect from unlink, but not from unmount). + */ + atomic_inc(&info->stop_eviction); + mutex_unlock(&shmem_swaplist_mutex); + + error = shmem_unuse_inode(&info->vfs_inode, type); + cond_resched(); + + mutex_lock(&shmem_swaplist_mutex); + next = list_next_entry(info, swaplist); + if (!info->swapped) + list_del_init(&info->swaplist); + if (atomic_dec_and_test(&info->stop_eviction)) + wake_up_var(&info->stop_eviction); + if (error) + break; + } + mutex_unlock(&shmem_swaplist_mutex); + + return error; +} + +/* + * Move the page from the page cache to the swap cache. + */ +static int shmem_writepage(struct page *page, struct writeback_control *wbc) +{ + struct folio *folio = page_folio(page); + struct shmem_inode_info *info; + struct address_space *mapping; + struct inode *inode; + swp_entry_t swap; + pgoff_t index; + + /* + * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or + * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, + * and its shmem_writeback() needs them to be split when swapping. 
+ */ + if (folio_test_large(folio)) { + /* Ensure the subpages are still dirty */ + folio_test_set_dirty(folio); + if (split_huge_page(page) < 0) + goto redirty; + folio = page_folio(page); + folio_clear_dirty(folio); + } + + BUG_ON(!folio_test_locked(folio)); + mapping = folio->mapping; + index = folio->index; + inode = mapping->host; + info = SHMEM_I(inode); + if (info->flags & VM_LOCKED) + goto redirty; + if (!total_swap_pages) + goto redirty; + + /* + * Our capabilities prevent regular writeback or sync from ever calling + * shmem_writepage; but a stacking filesystem might use ->writepage of + * its underlying filesystem, in which case tmpfs should write out to + * swap only in response to memory pressure, and not for the writeback + * threads or sync. + */ + if (!wbc->for_reclaim) { + WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ + goto redirty; + } + + /* + * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC + * value into swapfile.c, the only way we can correctly account for a + * fallocated folio arriving here is now to initialize it and write it. + * + * That's okay for a folio already fallocated earlier, but if we have + * not yet completed the fallocation, then (a) we want to keep track + * of this folio in case we have to undo it, and (b) it may not be a + * good idea to continue anyway, once we're pushing into swap. So + * reactivate the folio, and let shmem_fallocate() quit when too many. 
+ */ + if (!folio_test_uptodate(folio)) { + if (inode->i_private) { + struct shmem_falloc *shmem_falloc; + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + !shmem_falloc->waitq && + index >= shmem_falloc->start && + index < shmem_falloc->next) + shmem_falloc->nr_unswapped++; + else + shmem_falloc = NULL; + spin_unlock(&inode->i_lock); + if (shmem_falloc) + goto redirty; + } + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + + swap = folio_alloc_swap(folio); + if (!swap.val) + goto redirty; + + /* + * Add inode to shmem_unuse()'s list of swapped-out inodes, + * if it's not already there. Do it now before the folio is + * moved to swap cache, when its pagelock no longer protects + * the inode from eviction. But don't unlock the mutex until + * we've incremented swapped, because shmem_unuse_inode() will + * prune a !swapped inode from the swaplist under this mutex. + */ + mutex_lock(&shmem_swaplist_mutex); + if (list_empty(&info->swaplist)) + list_add(&info->swaplist, &shmem_swaplist); + + if (add_to_swap_cache(folio, swap, + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, + NULL) == 0) { + spin_lock_irq(&info->lock); + shmem_recalc_inode(inode); + info->swapped++; + spin_unlock_irq(&info->lock); + + swap_shmem_alloc(swap); + shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); + + mutex_unlock(&shmem_swaplist_mutex); + BUG_ON(folio_mapped(folio)); + swap_writepage(&folio->page, wbc); + return 0; + } + + mutex_unlock(&shmem_swaplist_mutex); + put_swap_folio(folio, swap); +redirty: + folio_mark_dirty(folio); + if (wbc->for_reclaim) + return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ + folio_unlock(folio); + return 0; +} + +#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) +static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) +{ + char buffer[64]; + + if (!mpol || mpol->mode == MPOL_DEFAULT) + return; /* show nothing */ + + 
mpol_to_str(buffer, sizeof(buffer), mpol); + + seq_printf(seq, ",mpol=%s", buffer); +} + +static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + struct mempolicy *mpol = NULL; + if (sbinfo->mpol) { + raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ + mpol = sbinfo->mpol; + mpol_get(mpol); + raw_spin_unlock(&sbinfo->stat_lock); + } + return mpol; +} +#else /* !CONFIG_NUMA || !CONFIG_TMPFS */ +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) +{ +} +static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + return NULL; +} +#endif /* CONFIG_NUMA && CONFIG_TMPFS */ +#ifndef CONFIG_NUMA +#define vm_policy vm_private_data +#endif + +static void shmem_pseudo_vma_init(struct vm_area_struct *vma, + struct shmem_inode_info *info, pgoff_t index) +{ + /* Create a pseudo vma that just contains the policy */ + vma_init(vma, NULL); + /* Bias interleave by inode number to distribute better across nodes */ + vma->vm_pgoff = index + info->vfs_inode.i_ino; + vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); +} + +static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) +{ + /* Drop reference taken by mpol_shared_policy_lookup() */ + mpol_cond_put(vma->vm_policy); +} + +static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + struct vm_area_struct pvma; + struct page *page; + struct vm_fault vmf = { + .vma = &pvma, + }; + + shmem_pseudo_vma_init(&pvma, info, index); + page = swap_cluster_readahead(swap, gfp, &vmf); + shmem_pseudo_vma_destroy(&pvma); + + if (!page) + return NULL; + return page_folio(page); +} + +/* + * Make sure huge_gfp is always more limited than limit_gfp. + * Some of the flags set permissions, while others set limitations. 
+ */ +static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) +{ + gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; + gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; + gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); + + /* Allow allocations only from the originally specified zones. */ + result |= zoneflags; + + /* + * Minimize the result gfp by taking the union with the deny flags, + * and the intersection of the allow flags. + */ + result |= (limit_gfp & denyflags); + result |= (huge_gfp & limit_gfp) & allowflags; + + return result; +} + +static struct folio *shmem_alloc_hugefolio(gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + struct vm_area_struct pvma; + struct address_space *mapping = info->vfs_inode.i_mapping; + pgoff_t hindex; + struct folio *folio; + + hindex = round_down(index, HPAGE_PMD_NR); + if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, + XA_PRESENT)) + return NULL; + + shmem_pseudo_vma_init(&pvma, info, hindex); + folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); + shmem_pseudo_vma_destroy(&pvma); + if (!folio) + count_vm_event(THP_FILE_FALLBACK); + return folio; +} + +static struct folio *shmem_alloc_folio(gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + struct vm_area_struct pvma; + struct folio *folio; + + shmem_pseudo_vma_init(&pvma, info, index); + folio = vma_alloc_folio(gfp, 0, &pvma, 0, false); + shmem_pseudo_vma_destroy(&pvma); + + return folio; +} + +static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, + pgoff_t index, bool huge) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct folio *folio; + int nr; + int err = -ENOSPC; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + huge = false; + nr = huge ? 
HPAGE_PMD_NR : 1; + + if (!shmem_inode_acct_block(inode, nr)) + goto failed; + + if (huge) + folio = shmem_alloc_hugefolio(gfp, info, index); + else + folio = shmem_alloc_folio(gfp, info, index); + if (folio) { + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + return folio; + } + + err = -ENOMEM; + shmem_inode_unacct_blocks(inode, nr); +failed: + return ERR_PTR(err); +} + +/* + * When a page is moved from swapcache to shmem filecache (either by the + * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of + * shmem_unuse_inode()), it may have been read in earlier from swap, in + * ignorance of the mapping it belongs to. If that mapping has special + * constraints (like the gma500 GEM driver, which requires RAM below 4GB), + * we may need to copy to a suitable page before moving to filecache. + * + * In a future release, this may well be extended to respect cpuset and + * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); + * but for now it is a simple matter of zone. + */ +static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) +{ + return folio_zonenum(folio) > gfp_zone(gfp); +} + +static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + struct folio *old, *new; + struct address_space *swap_mapping; + swp_entry_t entry; + pgoff_t swap_index; + int error; + + old = *foliop; + entry = folio_swap_entry(old); + swap_index = swp_offset(entry); + swap_mapping = swap_address_space(entry); + + /* + * We have arrived here because our zones are constrained, so don't + * limit chance of success by further cpuset and node constraints. 
+ */ + gfp &= ~GFP_CONSTRAINT_MASK; + VM_BUG_ON_FOLIO(folio_test_large(old), old); + new = shmem_alloc_folio(gfp, info, index); + if (!new) + return -ENOMEM; + + folio_get(new); + folio_copy(new, old); + flush_dcache_folio(new); + + __folio_set_locked(new); + __folio_set_swapbacked(new); + folio_mark_uptodate(new); + folio_set_swap_entry(new, entry); + folio_set_swapcache(new); + + /* + * Our caller will very soon move newpage out of swapcache, but it's + * a nice clean interface for us to replace oldpage by newpage there. + */ + xa_lock_irq(&swap_mapping->i_pages); + error = shmem_replace_entry(swap_mapping, swap_index, old, new); + if (!error) { + mem_cgroup_migrate(old, new); + __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); + __lruvec_stat_mod_folio(new, NR_SHMEM, 1); + __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); + __lruvec_stat_mod_folio(old, NR_SHMEM, -1); + } + xa_unlock_irq(&swap_mapping->i_pages); + + if (unlikely(error)) { + /* + * Is this possible? I think not, now that our callers check + * both PageSwapCache and page_private after getting page lock; + * but be defensive. Reverse old to newpage for clear and free. + */ + old = new; + } else { + folio_add_lru(new); + *foliop = new; + } + + folio_clear_swapcache(old); + old->private = NULL; + + folio_unlock(old); + folio_put_refs(old, 2); + return error; +} + +static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, + struct folio *folio, swp_entry_t swap) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + swp_entry_t swapin_error; + void *old; + + swapin_error = make_swapin_error_entry(&folio->page); + old = xa_cmpxchg_irq(&mapping->i_pages, index, + swp_to_radix_entry(swap), + swp_to_radix_entry(swapin_error), 0); + if (old != swp_to_radix_entry(swap)) + return; + + folio_wait_writeback(folio); + delete_from_swap_cache(folio); + spin_lock_irq(&info->lock); + /* + * Don't treat swapin error folio as alloced. 
Otherwise inode->i_blocks won't + * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in + * shmem_evict_inode. + */ + info->alloced--; + info->swapped--; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + swap_free(swap); +} + +/* + * Swap in the folio pointed to by *foliop. + * Caller has to make sure that *foliop contains a valid swapped folio. + * Returns 0 and the folio in foliop if success. On failure, returns the + * error code and NULL in *foliop. + */ +static int shmem_swapin_folio(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, + gfp_t gfp, struct vm_area_struct *vma, + vm_fault_t *fault_type) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; + struct folio *folio = NULL; + swp_entry_t swap; + int error; + + VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); + swap = radix_to_swp_entry(*foliop); + *foliop = NULL; /* stays NULL unless the swapin succeeds */ + + if (is_swapin_error_entry(swap)) /* entry is a swapin-error marker, cf. shmem_set_folio_swapin_error() */ + return -EIO; + + /* Look it up and read it in.. */ + folio = swap_cache_get_folio(swap, NULL, 0); + if (!folio) { + /* Or update major stats only when swapin succeeds?? */ + if (fault_type) { + *fault_type |= VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + count_memcg_event_mm(charge_mm, PGMAJFAULT); + } + /* Here we actually start the io */ + folio = shmem_swapin(swap, gfp, info, index); + if (!folio) { + error = -ENOMEM; + goto failed; + } + } + + /* We have to do this with folio locked to prevent races */ + folio_lock(folio); + if (!folio_test_swapcache(folio) || + folio_swap_entry(folio).val != swap.val || + !shmem_confirm_swap(mapping, index, swap)) { + error = -EEXIST; /* folio or mapping slot no longer matches swap; shmem_get_folio_gfp() retries on -EEXIST */ + goto unlock; + } + if (!folio_test_uptodate(folio)) { + error = -EIO; + goto failed; + } + folio_wait_writeback(folio); + + /* + * Some architectures may have to restore extra metadata to the + * folio after reading from swap.
+ */ + arch_swap_restore(swap, folio); + + if (shmem_should_replace_folio(folio, gfp)) { + error = shmem_replace_folio(&folio, gfp, info, index); + if (error) + goto failed; + } + + error = shmem_add_to_page_cache(folio, mapping, index, + swp_to_radix_entry(swap), gfp, + charge_mm); + if (error) + goto failed; + + spin_lock_irq(&info->lock); + info->swapped--; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + + if (sgp == SGP_WRITE) + folio_mark_accessed(folio); + + delete_from_swap_cache(folio); + folio_mark_dirty(folio); + swap_free(swap); + + *foliop = folio; + return 0; +failed: + if (!shmem_confirm_swap(mapping, index, swap)) + error = -EEXIST; /* the slot changed meanwhile: report that instead of the original error */ + if (error == -EIO) + shmem_set_folio_swapin_error(inode, index, folio, swap); +unlock: + if (folio) { + folio_unlock(folio); + folio_put(folio); + } + + return error; +} + +/* + * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate + * + * If we allocate a new one we do not mark it dirty. That's up to the + * vm. If we swap it in we mark it dirty since we also free the swap + * entry since a page cannot live in both the swap and page cache. + * + * vma, vmf, and fault_type are only supplied by shmem_fault: + * otherwise they are NULL. + */ +static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, + struct vm_area_struct *vma, struct vm_fault *vmf, + vm_fault_t *fault_type) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo; + struct mm_struct *charge_mm; + struct folio *folio; + pgoff_t hindex = index; + gfp_t huge_gfp; + int error; + int once = 0; + int alloced = 0; + + if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) /* index lies beyond MAX_LFS_FILESIZE */ + return -EFBIG; +repeat: + if (sgp <= SGP_CACHE && + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { + return -EINVAL; + } + + sbinfo = SHMEM_SB(inode->i_sb); + charge_mm = vma ?
vma->vm_mm : NULL; + + folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0); + if (folio && vma && userfaultfd_minor(vma)) { /* minor userfault: release any real folio and hand the fault to userfaultfd */ + if (!xa_is_value(folio)) { + folio_unlock(folio); + folio_put(folio); + } + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); + return 0; + } + + if (xa_is_value(folio)) { /* the slot holds a swap entry, not a folio */ + error = shmem_swapin_folio(inode, index, &folio, + sgp, gfp, vma, fault_type); + if (error == -EEXIST) /* the swap entry changed under us: look the index up again */ + goto repeat; + + *foliop = folio; + return error; + } + + if (folio) { + hindex = folio->index; + if (sgp == SGP_WRITE) + folio_mark_accessed(folio); + if (folio_test_uptodate(folio)) + goto out; + /* fallocated folio */ + if (sgp != SGP_READ) + goto clear; + folio_unlock(folio); + folio_put(folio); + } + + /* + * SGP_READ: succeed on hole, with NULL folio, letting caller zero. + * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. + */ + *foliop = NULL; + if (sgp == SGP_READ) + return 0; + if (sgp == SGP_NOALLOC) + return -ENOENT; + + /* + * Fast cache lookup and swap lookup did not find it: allocate. + */ + + if (vma && userfaultfd_missing(vma)) { + *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); + return 0; + } + + if (!shmem_is_huge(vma, inode, index, false)) + goto alloc_nohuge; + + huge_gfp = vma_thp_gfp_mask(vma); + huge_gfp = limit_gfp_mask(huge_gfp, gfp); + folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true); + if (IS_ERR(folio)) { /* huge attempt failed: fall back to a non-huge folio */ +alloc_nohuge: + folio = shmem_alloc_and_acct_folio(gfp, inode, index, false); + } + if (IS_ERR(folio)) { + int retry = 5; + + error = PTR_ERR(folio); + folio = NULL; + if (error != -ENOSPC) + goto unlock; + /* + * Try to reclaim some space by splitting a large folio + * beyond i_size on the filesystem.
+ */ + while (retry--) { + int ret; + + ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); + if (ret == SHRINK_STOP) + break; + if (ret) /* any other nonzero result: retry the non-huge allocation */ + goto alloc_nohuge; + } + goto unlock; + } + + hindex = round_down(index, folio_nr_pages(folio)); /* first index covered by the new folio */ + + if (sgp == SGP_WRITE) + __folio_set_referenced(folio); + + error = shmem_add_to_page_cache(folio, mapping, hindex, + NULL, gfp & GFP_RECLAIM_MASK, + charge_mm); + if (error) + goto unacct; + folio_add_lru(folio); + + spin_lock_irq(&info->lock); + info->alloced += folio_nr_pages(folio); + inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio); + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + alloced = true; /* remembered so the truncation re-check below can remove the folio again */ + + if (folio_test_pmd_mappable(folio) && + DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < + folio_next_index(folio) - 1) { + /* + * Part of the large folio is beyond i_size: subject + * to shrink under memory pressure. + */ + spin_lock(&sbinfo->shrinklist_lock); + /* + * _careful to defend against unlocked access to + * ->shrink_list in shmem_unused_huge_shrink() + */ + if (list_empty_careful(&info->shrinklist)) { + list_add_tail(&info->shrinklist, + &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + } + spin_unlock(&sbinfo->shrinklist_lock); + } + + /* + * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. + */ + if (sgp == SGP_FALLOC) + sgp = SGP_WRITE; +clear: + /* + * Let SGP_WRITE caller clear ends if write does not fill folio; + * but SGP_FALLOC on a folio fallocated earlier must initialize + * it now, lest undo on failure cancel our earlier guarantee.
+ */ + if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { + long i, n = folio_nr_pages(folio); + + for (i = 0; i < n; i++) + clear_highpage(folio_page(folio, i)); + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + + /* Perhaps the file has been truncated since we checked */ + if (sgp <= SGP_CACHE && + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { + if (alloced) { + folio_clear_dirty(folio); + filemap_remove_folio(folio); + spin_lock_irq(&info->lock); + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + } + error = -EINVAL; + goto unlock; + } +out: + *foliop = folio; + return 0; + + /* + * Error recovery. + */ +unacct: + shmem_inode_unacct_blocks(inode, folio_nr_pages(folio)); + + if (folio_test_large(folio)) { /* a large folio could not be added: drop it and try a non-huge one */ + folio_unlock(folio); + folio_put(folio); + goto alloc_nohuge; + } +unlock: + if (folio) { + folio_unlock(folio); + folio_put(folio); + } + if (error == -ENOSPC && !once++) { /* on the first -ENOSPC only: recalculate the inode and retry once */ + spin_lock_irq(&info->lock); + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + goto repeat; + } + if (error == -EEXIST) + goto repeat; + return error; +} + /* Non-fault entry point: uses the mapping's gfp mask and passes NULL for vma, vmf and fault_type. */ +int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, + enum sgp_type sgp) +{ + return shmem_get_folio_gfp(inode, index, foliop, sgp, + mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); +} + +/* + * This is like autoremove_wake_function, but it removes the wait queue + * entry unconditionally - even if something else had already woken the + * target.
+ */ +static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) +{ + int ret = default_wake_function(wait, mode, sync, key); + list_del_init(&wait->entry); /* dequeue whatever ret says */ + return ret; +} + /* Fault handler: waits out a hole-punch that covers the faulting offset, otherwise looks the folio up with SGP_CACHE. */ +static vm_fault_t shmem_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct inode *inode = file_inode(vma->vm_file); + gfp_t gfp = mapping_gfp_mask(inode->i_mapping); + struct folio *folio = NULL; + int err; + vm_fault_t ret = VM_FAULT_LOCKED; /* starting value; passed by address to shmem_get_folio_gfp() as fault_type */ + + /* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_rwsem. So refrain from + * faulting pages into the hole while it's being punched. Although + * shmem_undo_range() does remove the additions, it may be unable to + * keep up, as each new page needs its own unmap_mapping_range() call, + * and the i_mmap tree grows ever slower to scan if new vmas are added. + * + * It does not matter if we sometimes reach this check just before the + * hole-punch begins, so that one fault then races with the punch: + * we just need to make racing faults a rare case. + * + * The implementation below would be much simpler if we just used a + * standard mutex or completion: but we cannot take i_rwsem in fault, + * and bloating every shmem inode for this unlikely case would be sad.
+ */ + if (unlikely(inode->i_private)) { + struct shmem_falloc *shmem_falloc; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; /* re-read under i_lock */ + if (shmem_falloc && + shmem_falloc->waitq && + vmf->pgoff >= shmem_falloc->start && + vmf->pgoff < shmem_falloc->next) { + struct file *fpin; + wait_queue_head_t *shmem_falloc_waitq; + DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); + + ret = VM_FAULT_NOPAGE; + fpin = maybe_unlock_mmap_for_io(vmf, NULL); + if (fpin) /* a file pin came back: report RETRY instead of NOPAGE */ + ret = VM_FAULT_RETRY; + + shmem_falloc_waitq = shmem_falloc->waitq; + prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + + /* + * shmem_falloc_waitq points into the shmem_fallocate() + * stack of the hole-punching task: shmem_falloc_waitq + * is usually invalid by the time we reach here, but + * finish_wait() does not dereference it in that case; + * though i_lock needed lest racing with wake_up_all(). + */ + spin_lock(&inode->i_lock); + finish_wait(shmem_falloc_waitq, &shmem_fault_wait); + spin_unlock(&inode->i_lock); + + if (fpin) + fput(fpin); + return ret; + } + spin_unlock(&inode->i_lock); + } + + err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, + gfp, vma, vmf, &ret); + if (err) + return vmf_error(err); + if (folio) /* folio can still be NULL on success, e.g. after the userfault paths in shmem_get_folio_gfp() */ + vmf->page = folio_file_page(folio, vmf->pgoff); + return ret; +} + /* Pick an address for a mapping, adjusting it where possible so that the address and the file offset agree modulo HPAGE_PMD_SIZE. */ +unsigned long shmem_get_unmapped_area(struct file *file, + unsigned long uaddr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + unsigned long (*get_area)(struct file *, + unsigned long, unsigned long, unsigned long, unsigned long); + unsigned long addr; + unsigned long offset; + unsigned long inflated_len; + unsigned long inflated_addr; + unsigned long inflated_offset; + + if (len > TASK_SIZE) + return -ENOMEM; + + get_area = current->mm->get_unmapped_area; + addr = get_area(file, uaddr, len, pgoff, flags); /* baseline answer: every early exit below returns it unchanged */ + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return addr; + if (IS_ERR_VALUE(addr)) +
return addr; + if (addr & ~PAGE_MASK) + return addr; + if (addr > TASK_SIZE - len) + return addr; + + if (shmem_huge == SHMEM_HUGE_DENY) + return addr; + if (len < HPAGE_PMD_SIZE) + return addr; + if (flags & MAP_FIXED) + return addr; + /* + * Our priority is to support MAP_SHARED mapped hugely; + * and support MAP_PRIVATE mapped hugely too, until it is COWed. + * But if caller specified an address hint and we allocated area there + * successfully, respect that as before. + */ + if (uaddr == addr) + return addr; + + if (shmem_huge != SHMEM_HUGE_FORCE) { + struct super_block *sb; + + if (file) { + VM_BUG_ON(file->f_op != &shmem_file_operations); + sb = file_inode(file)->i_sb; + } else { + /* + * Called directly from mm/mmap.c, or drivers/char/mem.c + * for "/dev/zero", to create a shared anonymous object. + */ + if (IS_ERR(shm_mnt)) + return addr; + sb = shm_mnt->mnt_sb; + } + if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER) + return addr; + } + + offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); /* file offset modulo HPAGE_PMD_SIZE */ + if (offset && offset + len < 2 * HPAGE_PMD_SIZE) + return addr; + if ((addr & (HPAGE_PMD_SIZE-1)) == offset) /* address and file offset already agree modulo HPAGE_PMD_SIZE */ + return addr; + + inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; /* extra room so the start can be shifted to match offset */ + if (inflated_len > TASK_SIZE) + return addr; + if (inflated_len < len) /* the addition wrapped around */ + return addr; + + inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); + if (IS_ERR_VALUE(inflated_addr)) + return addr; + if (inflated_addr & ~PAGE_MASK) + return addr; + + inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); + inflated_addr += offset - inflated_offset; /* move to the wanted offset, stepping up one HPAGE_PMD_SIZE if that moved backwards */ + if (inflated_offset > offset) + inflated_addr += HPAGE_PMD_SIZE; + + if (inflated_addr > TASK_SIZE - len) + return addr; + return inflated_addr; +} + +#ifdef CONFIG_NUMA +static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) +{ + struct inode *inode = file_inode(vma->vm_file); /* the policy lives in this inode's shmem_inode_info */ + return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); +} + +static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, +
unsigned long addr) +{ + struct inode *inode = file_inode(vma->vm_file); + pgoff_t index; + + index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; /* page distance of addr from vm_start, plus the vma's vm_pgoff */ + return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); +} +#endif + /* Set or clear VM_LOCKED on the file. Locking charges inode->i_size through user_shm_lock() and marks the mapping unevictable; unlocking (only when ucounts is non-NULL) reverses both. Returns 0, or -ENOMEM if user_shm_lock() refuses. */ +int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) +{ + struct inode *inode = file_inode(file); + struct shmem_inode_info *info = SHMEM_I(inode); + int retval = -ENOMEM; + + /* + * What serializes the accesses to info->flags? + * ipc_lock_object() when called from shmctl_do_lock(), + * no serialization needed when called from shm_destroy(). + */ + if (lock && !(info->flags & VM_LOCKED)) { + if (!user_shm_lock(inode->i_size, ucounts)) + goto out_nomem; + info->flags |= VM_LOCKED; + mapping_set_unevictable(file->f_mapping); + } + if (!lock && (info->flags & VM_LOCKED) && ucounts) { + user_shm_unlock(inode->i_size, ucounts); + info->flags &= ~VM_LOCKED; + mapping_clear_unevictable(file->f_mapping); + } + retval = 0; + +out_nomem: + return retval; +} + /* mmap handler: applies the seal check, allows memory tagging on the vma and installs shmem_vm_ops. */ +static int shmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + int ret; + + ret = seal_check_future_write(info->seals, vma); + if (ret) /* the seal check rejected this mapping */ + return ret; + + /* arm64 - allow memory tagging on RAM-based files */ + vma->vm_flags |= VM_MTE_ALLOWED; + + file_accessed(file); + vma->vm_ops = &shmem_vm_ops; + return 0; +} + +#ifdef CONFIG_TMPFS_XATTR +static int shmem_initxattrs(struct inode *, const struct xattr *, void *); + +/* + * chattr's fsflags are unrelated to extended attributes, + * but tmpfs has chosen to enable them under the same config option. + */ +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +{ + unsigned int i_flags = 0; + + if (fsflags & FS_NOATIME_FL) + i_flags |= S_NOATIME; + if (fsflags & FS_APPEND_FL) + i_flags |= S_APPEND; + if (fsflags & FS_IMMUTABLE_FL) + i_flags |= S_IMMUTABLE; + /* + * But FS_NODUMP_FL does not require any action in i_flags.
+ */ + inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); /* NOTE(review): third argument is presumably the mask of bits to update - confirm against inode_set_flags() */ +} +#else +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +{ +} +#define shmem_initxattrs NULL +#endif + /* Reserve an inode number, allocate the inode and initialise its shmem_inode_info and its operations according to mode. Returns NULL on failure. */ +static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) +{ + struct inode *inode; + struct shmem_inode_info *info; + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + ino_t ino; + + if (shmem_reserve_inode(sb, &ino)) /* reservation failed */ + return NULL; + + inode = new_inode(sb); + if (inode) { + inode->i_ino = ino; + inode_init_owner(&init_user_ns, inode, dir, mode); + inode->i_blocks = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_generation = get_random_u32(); + info = SHMEM_I(inode); + memset(info, 0, (char *)inode - (char *)info); /* zero everything from info up to, not including, the inode */ + spin_lock_init(&info->lock); + atomic_set(&info->stop_eviction, 0); + info->seals = F_SEAL_SEAL; + info->flags = flags & VM_NORESERVE; + info->i_crtime = inode->i_mtime; + info->fsflags = (dir == NULL) ?
0 : + SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; + if (info->fsflags) + shmem_set_inode_flags(inode, info->fsflags); + INIT_LIST_HEAD(&info->shrinklist); + INIT_LIST_HEAD(&info->swaplist); + simple_xattrs_init(&info->xattrs); + cache_no_acl(inode); + mapping_set_large_folios(inode->i_mapping); + + switch (mode & S_IFMT) { + default: + inode->i_op = &shmem_special_inode_operations; + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_mapping->a_ops = &shmem_aops; + inode->i_op = &shmem_inode_operations; + inode->i_fop = &shmem_file_operations; + mpol_shared_policy_init(&info->policy, + shmem_get_sbmpol(sbinfo)); + break; + case S_IFDIR: + inc_nlink(inode); + /* Some things misbehave if size == 0 on a directory */ + inode->i_size = 2 * BOGO_DIRENT_SIZE; + inode->i_op = &shmem_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + break; + case S_IFLNK: + /* + * Must not load anything in the rbtree, + * mpol_free_shared_policy will not be called. + */ + mpol_shared_policy_init(&info->policy, NULL); + break; + } + + lockdep_annotate_inode_mutex_key(inode); + } else /* new_inode() failed: give back the reservation made above */ + shmem_free_inode(sb); + return inode; +} + +#ifdef CONFIG_USERFAULTFD +int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + bool zeropage, bool wp_copy, + struct page **pagep) +{ + struct inode *inode = file_inode(dst_vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct address_space *mapping = inode->i_mapping; + gfp_t gfp = mapping_gfp_mask(mapping); + pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + void *page_kaddr; + struct folio *folio; + int ret; + pgoff_t max_off; + + if (!shmem_inode_acct_block(inode, 1)) { + /* + * We may have got a page, returned -ENOENT triggering a retry, + * and now we find ourselves with -ENOMEM. Release the page, to + * avoid a BUG_ON in our caller.
+ */ + if (unlikely(*pagep)) { + put_page(*pagep); + *pagep = NULL; + } + return -ENOMEM; + } + + if (!*pagep) { /* no page carried over from an earlier attempt: allocate and fill one */ + ret = -ENOMEM; + folio = shmem_alloc_folio(gfp, info, pgoff); + if (!folio) + goto out_unacct_blocks; + + if (!zeropage) { /* COPY */ + page_kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); + ret = copy_from_user(page_kaddr, + (const void __user *)src_addr, + PAGE_SIZE); + pagefault_enable(); + kunmap_local(page_kaddr); + + /* fallback to copy_from_user outside mmap_lock */ + if (unlikely(ret)) { + *pagep = &folio->page; + ret = -ENOENT; + /* don't free the page */ + goto out_unacct_blocks; + } + + flush_dcache_folio(folio); + } else { /* ZEROPAGE */ + clear_user_highpage(&folio->page, dst_addr); + } + } else { /* retry: reuse the page handed back through *pagep */ + folio = page_folio(*pagep); + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + *pagep = NULL; + } + + VM_BUG_ON(folio_test_locked(folio)); + VM_BUG_ON(folio_test_swapbacked(folio)); + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + __folio_mark_uptodate(folio); + + ret = -EFAULT; + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(pgoff >= max_off)) /* target offset lies at or beyond i_size */ + goto out_release; + + ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, + gfp & GFP_RECLAIM_MASK, dst_mm); + if (ret) + goto out_release; + + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, +
&folio->page, true, wp_copy); + if (ret) + goto out_delete_from_cache; + + spin_lock_irq(&info->lock); + info->alloced++; + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + + folio_unlock(folio); + return 0; +out_delete_from_cache: /* labels unwind in reverse order of setup and fall through */ + filemap_remove_folio(folio); +out_release: + folio_unlock(folio); + folio_put(folio); +out_unacct_blocks: + shmem_inode_unacct_blocks(inode, 1); + return ret; +} +#endif /* CONFIG_USERFAULTFD */ + +#ifdef CONFIG_TMPFS +static const struct inode_operations shmem_symlink_inode_operations; +static const struct inode_operations shmem_short_symlink_operations; + /* write_begin: enforce the seals, then get the folio for pos with SGP_WRITE and return its page in *pagep. */ +static int +shmem_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct shmem_inode_info *info = SHMEM_I(inode); + pgoff_t index = pos >> PAGE_SHIFT; + struct folio *folio; + int ret = 0; + + /* i_rwsem is held by caller */ + if (unlikely(info->seals & (F_SEAL_GROW | + F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) + return -EPERM; + if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) + return -EPERM; + } + + ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); + + if (ret) + return ret; + + *pagep = folio_file_page(folio, index); + if (PageHWPoison(*pagep)) { /* refuse to write into a hardware-poisoned page */ + folio_unlock(folio); + folio_put(folio); + *pagep = NULL; + return -EIO; + } + + return 0; +} + /* write_end: grow i_size if the write went past it, make the page uptodate and dirty, then unlock and release it. Returns copied. */ +static int +shmem_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + + if (pos + copied > inode->i_size) /* the write extended the file */ + i_size_write(inode, pos + copied); + + if (!PageUptodate(page)) { + struct page *head = compound_head(page); + if (PageTransCompound(page)) { + int i; + + for (i = 0; i < HPAGE_PMD_NR; i++) { + if (head + i == page) /* the written subpage itself is not cleared here */ + continue; + clear_highpage(head + i); +
flush_dcache_page(head + i); + } + } + if (copied < PAGE_SIZE) { /* zero the parts of this page outside the copied range */ + unsigned from = pos & (PAGE_SIZE - 1); + zero_user_segments(page, 0, from, + from + copied, PAGE_SIZE); + } + SetPageUptodate(head); + } + set_page_dirty(page); + unlock_page(page); + put_page(page); + + return copied; +} + /* read_iter: copy file data page by page into the iterator, supplying zeroes where no folio exists. Returns bytes copied, or the error if nothing was copied. */ +static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + unsigned long offset; + int error = 0; + ssize_t retval = 0; + loff_t *ppos = &iocb->ki_pos; + + index = *ppos >> PAGE_SHIFT; + offset = *ppos & ~PAGE_MASK; + + for (;;) { + struct folio *folio = NULL; + struct page *page = NULL; + pgoff_t end_index; + unsigned long nr, ret; + loff_t i_size = i_size_read(inode); + + end_index = i_size >> PAGE_SHIFT; + if (index > end_index) + break; + if (index == end_index) { + nr = i_size & ~PAGE_MASK; + if (nr <= offset) + break; + } + + error = shmem_get_folio(inode, index, &folio, SGP_READ); + if (error) { + if (error == -EINVAL) /* -EINVAL from the lookup just ends the read without an error */ + error = 0; + break; + } + if (folio) { + folio_unlock(folio); + + page = folio_file_page(folio, index); + if (PageHWPoison(page)) { + folio_put(folio); + error = -EIO; + break; + } + } + + /* + * We must evaluate after, since reads (unlike writes) + * are called without i_rwsem protection against truncate + */ + nr = PAGE_SIZE; + i_size = i_size_read(inode); + end_index = i_size >> PAGE_SHIFT; + if (index == end_index) { + nr = i_size & ~PAGE_MASK; + if (nr <= offset) { + if (folio) + folio_put(folio); + break; + } + } + nr -= offset; /* bytes available in this page from offset onwards */ + + if (folio) { + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + /* + * Mark the page accessed if we read the beginning.
+ */ + if (!offset) + folio_mark_accessed(folio); + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + */ + ret = copy_page_to_iter(page, offset, nr, to); + folio_put(folio); + + } else if (user_backed_iter(to)) { + /* + * Copy to user tends to be so well optimized, but + * clear_user() not so much, that it is noticeably + * faster to copy the zero page instead of clearing. + */ + ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); + } else { + /* + * But submitting the same page twice in a row to + * splice() - or others? - can result in confusion: + * so don't attempt that optimization on pipes etc. + */ + ret = iov_iter_zero(nr, to); + } + + retval += ret; + offset += ret; + index += offset >> PAGE_SHIFT; + offset &= ~PAGE_MASK; + + if (!iov_iter_count(to)) + break; + if (ret < nr) { /* short copy */ + error = -EFAULT; + break; + } + cond_resched(); + } + + *ppos = ((loff_t) index << PAGE_SHIFT) + offset; + file_accessed(file); + return retval ? retval : error; /* bytes already copied take precedence over a late error */ +} + /* llseek: SEEK_DATA and SEEK_HOLE are resolved against the mapping under inode_lock(); every other whence goes to generic_file_llseek_size(). */ +static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + + if (whence != SEEK_DATA && whence != SEEK_HOLE) + return generic_file_llseek_size(file, offset, whence, + MAX_LFS_FILESIZE, i_size_read(inode)); + if (offset < 0) + return -ENXIO; + + inode_lock(inode); + /* We're holding i_rwsem so we can access i_size directly */ + offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); + if (offset >= 0) + offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); + inode_unlock(inode); + return offset; +} + +static long shmem_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_falloc shmem_falloc; + pgoff_t start, index, end, undo_fallocend; + int error; + + if (mode &
~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; /* only KEEP_SIZE and PUNCH_HOLE are accepted */ + + inode_lock(inode); + + if (mode & FALLOC_FL_PUNCH_HOLE) { + struct address_space *mapping = file->f_mapping; + loff_t unmap_start = round_up(offset, PAGE_SIZE); + loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); + + /* protected by i_rwsem */ + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { + error = -EPERM; + goto out; + } + + shmem_falloc.waitq = &shmem_falloc_waitq; /* shmem_fault() waits only when waitq is non-NULL */ + shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; + shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); + + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + shmem_truncate_range(inode, offset, offset + len - 1); + /* No need to unmap again: hole-punching leaves COWed pages */ + + spin_lock(&inode->i_lock); + inode->i_private = NULL; + wake_up_all(&shmem_falloc_waitq); + WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head)); + spin_unlock(&inode->i_lock); + error = 0; + goto out; + } + + /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ + error = inode_newsize_ok(inode, offset + len); + if (error) + goto out; + + if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { + error = -EPERM; + goto out; + } + + start = offset >> PAGE_SHIFT; + end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; /* one past the last page touched */ + /* Try to avoid a swapstorm if len is impossible to satisfy */ + if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { + error = -ENOSPC; + goto out; + } + + shmem_falloc.waitq = NULL; /* preallocation, not hole-punch: shmem_fault() will not wait on this */ + shmem_falloc.start = start; + shmem_falloc.next = start; + shmem_falloc.nr_falloced = 0; + shmem_falloc.nr_unswapped = 0; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); + + /* + * info->fallocend is only relevant when huge pages might be + *
involved: to prevent split_huge_page() freeing fallocated + * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. + */ + undo_fallocend = info->fallocend; + if (info->fallocend < end) + info->fallocend = end; + + for (index = start; index < end; ) { + struct folio *folio; + + /* + * Good, the fallocate(2) manpage permits EINTR: we may have + * been interrupted because we are using up too much memory. + */ + if (signal_pending(current)) + error = -EINTR; + else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) + error = -ENOMEM; + else + error = shmem_get_folio(inode, index, &folio, + SGP_FALLOC); + if (error) { + info->fallocend = undo_fallocend; /* restore the value saved before the loop */ + /* Remove the !uptodate folios we added */ + if (index > start) { + shmem_undo_range(inode, + (loff_t)start << PAGE_SHIFT, + ((loff_t)index << PAGE_SHIFT) - 1, true); + } + goto undone; + } + + /* + * Here is a more important optimization than it appears: + * a second SGP_FALLOC on the same large folio will clear it, + * making it uptodate and un-undoable if we fail later. + */ + index = folio_next_index(folio); + /* Beware 32-bit wraparound */ + if (!index) + index--; + + /* + * Inform shmem_writepage() how far we have reached. + * No need for lock or barrier: we have the page lock. + */ + if (!folio_test_uptodate(folio)) + shmem_falloc.nr_falloced += index - shmem_falloc.next; + shmem_falloc.next = index; + + /* + * If !uptodate, leave it that way so that freeable folios + * can be recognized if we need to rollback on error later. + * But mark it dirty so that memory pressure will swap rather + * than free the folios we are allocating (and SGP_CACHE folios + * might still be clean: we now need to mark those dirty too).
+ */ + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); + cond_resched(); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) + i_size_write(inode, offset + len); +undone: + spin_lock(&inode->i_lock); + inode->i_private = NULL; + spin_unlock(&inode->i_lock); +out: + if (!error) + file_modified(file); + inode_unlock(inode); + return error; +} + /* statfs: block and inode figures are filled in only when the corresponding limit (max_blocks, max_inodes) is set. */ +static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); + + buf->f_type = TMPFS_MAGIC; + buf->f_bsize = PAGE_SIZE; + buf->f_namelen = NAME_MAX; + if (sbinfo->max_blocks) { + buf->f_blocks = sbinfo->max_blocks; + buf->f_bavail = + buf->f_bfree = sbinfo->max_blocks - + percpu_counter_sum(&sbinfo->used_blocks); + } + if (sbinfo->max_inodes) { + buf->f_files = sbinfo->max_inodes; + buf->f_ffree = sbinfo->free_inodes; + } + /* else leave those fields 0 like simple_statfs */ + + buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); + + return 0; +} + +/* + * File creation. Allocate an inode, and we're done..
+ */ +static int +shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) +{ + struct inode *inode; + int error = -ENOSPC; /* returned as is when shmem_get_inode() yields no inode */ + + inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); + if (inode) { + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; + error = security_inode_init_security(inode, dir, + &dentry->d_name, + shmem_initxattrs, NULL); + if (error && error != -EOPNOTSUPP) /* -EOPNOTSUPP from the security hook is tolerated */ + goto out_iput; + + error = 0; + dir->i_size += BOGO_DIRENT_SIZE; + dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + } + return error; +out_iput: + iput(inode); + return error; +} + /* Create an inode for file and attach it with d_tmpfile(); unlike shmem_mknod(), dir's size and timestamps are not touched. */ +static int +shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode) +{ + struct inode *inode; + int error = -ENOSPC; + + inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE); + if (inode) { + error = security_inode_init_security(inode, dir, + NULL, + shmem_initxattrs, NULL); + if (error && error != -EOPNOTSUPP) + goto out_iput; + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; + d_tmpfile(file, inode); + } + return finish_open_simple(file, error); /* also reached with -ENOSPC when no inode was obtained */ +out_iput: + iput(inode); + return error; +} + /* Create a directory through shmem_mknod() and add one link to dir for it; &init_user_ns is passed on rather than mnt_userns. */ +static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + int error; + + if ((error = shmem_mknod(&init_user_ns, dir, dentry, + mode | S_IFDIR, 0))) + return error; + inc_nlink(dir); + return 0; +} + /* Create a regular file: shmem_mknod() with S_IFREG added to mode; excl is not consulted. */ +static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); +} + +/* + * Link a file..
+ */ +static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(old_dentry); + int ret = 0; + + /* + * No ordinary (disk based) filesystem counts links as inodes; + * but each new link needs a new dentry, pinning lowmem, and + * tmpfs dentries cannot be pruned until they are unlinked. + * But if an O_TMPFILE file is linked into the tmpfs, the + * first link must skip that, to get the accounting right. + */ + if (inode->i_nlink) { + ret = shmem_reserve_inode(inode->i_sb, NULL); + if (ret) + goto out; + } + + dir->i_size += BOGO_DIRENT_SIZE; + inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); + inc_nlink(inode); + ihold(inode); /* New dentry reference */ + dget(dentry); /* Extra pinning count for the created dentry */ + d_instantiate(dentry, inode); +out: + return ret; +} +/* + * Remove @dentry from @dir. For a non-directory that still has more than + * one link, the inode reservation taken in shmem_link() is given back. + * The final dput() drops the extra dentry count taken at creation time. + */ +static int shmem_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) + shmem_free_inode(inode->i_sb); + + dir->i_size -= BOGO_DIRENT_SIZE; + inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); + drop_nlink(inode); + dput(dentry); /* Undo the count from "create" - this does all the work */ + return 0; +} +/* + * Remove a directory. Fails with -ENOTEMPTY unless simple_empty() holds; + * otherwise drops one link on the directory and one on its parent before + * completing the removal through shmem_unlink(). + */ +static int shmem_rmdir(struct inode *dir, struct dentry *dentry) +{ + if (!simple_empty(dentry)) + return -ENOTEMPTY; + + drop_nlink(d_inode(dentry)); + drop_nlink(dir); + return shmem_unlink(dir, dentry); +} +/* + * Create a whiteout entry (character device with WHITEOUT_MODE and + * WHITEOUT_DEV) under the same parent and name as @old_dentry. Used by + * shmem_rename2() for RENAME_WHITEOUT. Returns 0, -ENOMEM if the dentry + * cannot be allocated, or the shmem_mknod() error. + */ +static int shmem_whiteout(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry) +{ + struct dentry *whiteout; + int error; + + whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); + if (!whiteout) + return -ENOMEM; + + error = shmem_mknod(&init_user_ns, old_dir, whiteout, + S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); + dput(whiteout); + if (error) + return error; + + /* + * Cheat and hash the whiteout 
while the old dentry is still in + * place, instead of playing games with FS_RENAME_DOES_D_MOVE. + * + * d_lookup() will consistently find one of them at this point, + * not sure which one, but that isn't even important. + */ + d_rehash(whiteout); + return 0; +} + +/* + * The VFS layer already does all the dentry stuff for rename, + * we just have to decrement the usage count for the target if + * it exists so that the VFS layer correctly frees it when it + * gets overwritten. + * + * Flags other than RENAME_NOREPLACE, RENAME_EXCHANGE and RENAME_WHITEOUT + * are rejected with -EINVAL; a non-empty target gives -ENOTEMPTY. + */ +static int shmem_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct inode *inode = d_inode(old_dentry); + int they_are_dirs = S_ISDIR(inode->i_mode); + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) + return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); + + if (!simple_empty(new_dentry)) + return -ENOTEMPTY; + + if (flags & RENAME_WHITEOUT) { + int error; + + error = shmem_whiteout(&init_user_ns, old_dir, old_dentry); + if (error) + return error; + } + + if (d_really_is_positive(new_dentry)) { + (void) shmem_unlink(new_dir, new_dentry); + if (they_are_dirs) { + drop_nlink(d_inode(new_dentry)); + drop_nlink(old_dir); + } + } else if (they_are_dirs) { + drop_nlink(old_dir); + inc_nlink(new_dir); + } + + old_dir->i_size -= BOGO_DIRENT_SIZE; + new_dir->i_size += BOGO_DIRENT_SIZE; + old_dir->i_ctime = old_dir->i_mtime = + new_dir->i_ctime = new_dir->i_mtime = + inode->i_ctime = current_time(old_dir); + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + return 0; +} +/* + * Create a symlink to @symname. A target whose length including the NUL is + * at most SHORT_SYMLINK_LEN is kept in a kmemdup()ed inode->i_link; a longer + * one is copied into folio 0 of the inode. Returns -ENAMETOOLONG when the + * target plus NUL exceeds PAGE_SIZE, -ENOSPC when no inode is available. + */ +static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) +{ + int error; + int len; + struct inode *inode; + struct folio *folio; + + len = strlen(symname) + 1; + if (len > PAGE_SIZE) + return -ENAMETOOLONG; + + inode = 
shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0, + VM_NORESERVE); + if (!inode) + return -ENOSPC; + + error = security_inode_init_security(inode, dir, &dentry->d_name, + shmem_initxattrs, NULL); + if (error && error != -EOPNOTSUPP) { + iput(inode); + return error; + } + + inode->i_size = len-1; /* len counts the NUL, i_size does not */ + if (len <= SHORT_SYMLINK_LEN) { + inode->i_link = kmemdup(symname, len, GFP_KERNEL); + if (!inode->i_link) { + iput(inode); + return -ENOMEM; + } + inode->i_op = &shmem_short_symlink_operations; + } else { + inode_nohighmem(inode); + error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); + if (error) { + iput(inode); + return error; + } + inode->i_mapping->a_ops = &shmem_aops; + inode->i_op = &shmem_symlink_inode_operations; + memcpy(folio_address(folio), symname, len); + folio_mark_uptodate(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); + } + dir->i_size += BOGO_DIRENT_SIZE; + dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); + d_instantiate(dentry, inode); + dget(dentry); + return 0; +} +/* + * Delayed-call callback installed by shmem_get_link(): @arg is the folio + * holding the link body; mark it accessed and drop the reference. + */ +static void shmem_put_link(void *arg) +{ + folio_mark_accessed(arg); + folio_put(arg); +} +/* + * Return the body of a long symlink, stored in folio 0 of @inode. + * With a NULL @dentry only a folio already in the page cache and uptodate + * is used, and -ECHILD is returned otherwise. With a dentry the folio is + * obtained through shmem_get_folio(SGP_READ). A hardware-poisoned page + * yields -ECHILD in both cases. The folio reference is released later via + * @done, which is set to call shmem_put_link(). + */ +static const char *shmem_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct folio *folio = NULL; + int error; + + if (!dentry) { + folio = filemap_get_folio(inode->i_mapping, 0); + if (!folio) + return ERR_PTR(-ECHILD); + if (PageHWPoison(folio_page(folio, 0)) || + !folio_test_uptodate(folio)) { + folio_put(folio); + return ERR_PTR(-ECHILD); + } + } else { + error = shmem_get_folio(inode, 0, &folio, SGP_READ); + if (error) + return ERR_PTR(error); + if (!folio) + return ERR_PTR(-ECHILD); + if (PageHWPoison(folio_page(folio, 0))) { + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(-ECHILD); + } + folio_unlock(folio); + } + set_delayed_call(done, shmem_put_link, folio); + return folio_address(folio); +} + +#ifdef CONFIG_TMPFS_XATTR +/* Report the inode's fsflags, masked to SHMEM_FL_USER_VISIBLE, in @fa. */ +static int shmem_fileattr_get(struct dentry 
*dentry, struct fileattr *fa) +{ + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + + fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); + + return 0; +} +/* + * Replace the user-modifiable part of the inode's fsflags with fa->flags. + * fsx-style requests and any flag outside SHMEM_FL_USER_MODIFIABLE are + * refused with -EOPNOTSUPP. On success ctime and i_version are updated. + */ +static int shmem_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct shmem_inode_info *info = SHMEM_I(inode); + + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; + if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) + return -EOPNOTSUPP; + + info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | + (fa->flags & SHMEM_FL_USER_MODIFIABLE); + + shmem_set_inode_flags(inode, info->fsflags); + inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); + return 0; +} + +/* + * Superblocks without xattr inode operations may get some security.* xattr + * support from the LSM "for free". As soon as we have any other xattrs + * like ACLs, we also need to implement the security.* handlers at + * filesystem level, though. + */ + +/* + * Callback for security_inode_init_security() for acquiring xattrs. 
+ */ +static int shmem_initxattrs(struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + const struct xattr *xattr; + struct simple_xattr *new_xattr; + size_t len; + /* @xattr_array ends at the first entry whose name is NULL; @fs_info is unused */ + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); + if (!new_xattr) + return -ENOMEM; + + len = strlen(xattr->name) + 1; + new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, + GFP_KERNEL); + if (!new_xattr->name) { + kvfree(new_xattr); + return -ENOMEM; + } + /* stored name = XATTR_SECURITY_PREFIX followed by xattr->name and its NUL */ + memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN); + memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, + xattr->name, len); + + simple_xattr_list_add(&info->xattrs, new_xattr); + } + + return 0; +} +/* + * ->get for the security.* and trusted.* handlers: expand @name to its full + * prefixed form and look it up in the inode's in-memory xattr list. + */ +static int shmem_xattr_handler_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + + name = xattr_full_name(handler, name); + return simple_xattr_get(&info->xattrs, name, buffer, size); +} +/* + * ->set for the security.* and trusted.* handlers: apply simple_xattr_set() + * to the inode's in-memory xattr list under the full prefixed name and, on + * success, update ctime and i_version. + */ +static int shmem_xattr_handler_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + int err; + + name = xattr_full_name(handler, name); + err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); + if (!err) { + inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); + } + return err; +} +/* security.* and trusted.* share the same get/set implementation */ +static const struct xattr_handler shmem_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; + +static const struct xattr_handler shmem_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; 
+ +static const struct xattr_handler *shmem_xattr_handlers[] = { +#ifdef CONFIG_TMPFS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &shmem_security_xattr_handler, + &shmem_trusted_xattr_handler, + NULL +}; +/* ->listxattr: delegate to simple_xattr_list() on the inode's xattr list. */ +static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); +} +#endif /* CONFIG_TMPFS_XATTR */ +/* Short symlinks keep their target in inode->i_link (see shmem_symlink()). */ +static const struct inode_operations shmem_short_symlink_operations = { + .getattr = shmem_getattr, + .get_link = simple_get_link, +#ifdef CONFIG_TMPFS_XATTR + .listxattr = shmem_listxattr, +#endif +}; +/* Long symlinks keep their target in folio 0 and read it via shmem_get_link(). */ +static const struct inode_operations shmem_symlink_inode_operations = { + .getattr = shmem_getattr, + .get_link = shmem_get_link, +#ifdef CONFIG_TMPFS_XATTR + .listxattr = shmem_listxattr, +#endif +}; +/* ->get_parent export operation: not supported, always returns -ESTALE. */ +static struct dentry *shmem_get_parent(struct dentry *child) +{ + return ERR_PTR(-ESTALE); +} +/* + * ilookup5() match callback. @vfh is a three-word file handle: fh[0] is the + * inode generation, fh[1] the low and fh[2] the high 32 bits of the inode + * number. + */ +static int shmem_match(struct inode *ino, void *vfh) +{ + __u32 *fh = vfh; + __u64 inum = fh[2]; + inum = (inum << 32) | fh[1]; + return ino->i_ino == inum && fh[0] == ino->i_generation; +} + +/* Find any alias of inode, but prefer a hashed alias */ +static struct dentry *shmem_find_alias(struct inode *inode) +{ + struct dentry *alias = d_find_alias(inode); + + return alias ?: d_find_any_alias(inode); +} + +/* + * Turn a file handle in the layout written by shmem_encode_fh() back into a + * dentry. Returns NULL when the handle has fewer than three words or when + * no hashed inode matches; the hash value is inode number plus generation, + * the same value shmem_encode_fh() hashes with. + */ +static struct dentry *shmem_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + struct inode *inode; + struct dentry *dentry = NULL; + u64 inum; + + if (fh_len < 3) + return NULL; + + inum = fid->raw[2]; + inum = (inum << 32) | fid->raw[1]; + + inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), + shmem_match, fid->raw); + if (inode) { + dentry = shmem_find_alias(inode); + iput(inode); + } + + return dentry; +} +/* + * Encode @inode as a three-word handle {generation, ino low 32 bits, ino + * high 32 bits}, hashing the inode first if it is not hashed yet. If the + * buffer holds fewer than three words, *len is set to 3 and FILEID_INVALID + * is returned. @parent is not used. + */ +static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len, + struct inode *parent) +{ + if (*len < 3) 
{ + *len = 3; + return FILEID_INVALID; + } + + if (inode_unhashed(inode)) { + /* Unfortunately insert_inode_hash is not idempotent, + * so as we hash inodes here rather than at creation + * time, we need a lock to ensure we only try + * to do it once + */ + static DEFINE_SPINLOCK(lock); + spin_lock(&lock); + if (inode_unhashed(inode)) + __insert_inode_hash(inode, + inode->i_ino + inode->i_generation); + spin_unlock(&lock); + } + + fh[0] = inode->i_generation; + fh[1] = inode->i_ino; + fh[2] = ((__u64)inode->i_ino) >> 32; + + *len = 3; + return 1; +} + +static const struct export_operations shmem_export_ops = { + .get_parent = shmem_get_parent, + .encode_fh = shmem_encode_fh, + .fh_to_dentry = shmem_fh_to_dentry, +}; +/* Mount option identifiers; each is bound to a key in shmem_fs_parameters[]. */ +enum shmem_param { + Opt_gid, + Opt_huge, + Opt_mode, + Opt_mpol, + Opt_nr_blocks, + Opt_nr_inodes, + Opt_size, + Opt_uid, + Opt_inode32, + Opt_inode64, +}; +/* Accepted values of the "huge=" mount option. */ +static const struct constant_table shmem_param_enums_huge[] = { + {"never", SHMEM_HUGE_NEVER }, + {"always", SHMEM_HUGE_ALWAYS }, + {"within_size", SHMEM_HUGE_WITHIN_SIZE }, + {"advise", SHMEM_HUGE_ADVISE }, + {} +}; + +const struct fs_parameter_spec shmem_fs_parameters[] = { + fsparam_u32 ("gid", Opt_gid), + fsparam_enum ("huge", Opt_huge, shmem_param_enums_huge), + fsparam_u32oct("mode", Opt_mode), + fsparam_string("mpol", Opt_mpol), + fsparam_string("nr_blocks", Opt_nr_blocks), + fsparam_string("nr_inodes", Opt_nr_inodes), + fsparam_string("size", Opt_size), + fsparam_u32 ("uid", Opt_uid), + fsparam_flag ("inode32", Opt_inode32), + fsparam_flag ("inode64", Opt_inode64), + {} +}; +/* + * Parse one mount parameter into the shmem_options in fc->fs_private, + * recording in ctx->seen which limits were given explicitly. Returns 0, + * the negative result of fs_parse(), or the invalfc() error for a bad or + * unsupported value. + */ +static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) +{ + struct shmem_options *ctx = fc->fs_private; + struct fs_parse_result result; + unsigned long long size; + char *rest; + int opt; + kuid_t kuid; + kgid_t kgid; + + opt = fs_parse(fc, shmem_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_size: + size = memparse(param->string, &rest); + if 
(*rest == '%') { /* "N%": N percent of totalram_pages(), in bytes */ + size <<= PAGE_SHIFT; + size *= totalram_pages(); + do_div(size, 100); + rest++; + } + if (*rest) + goto bad_value; + ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); + ctx->seen |= SHMEM_SEEN_BLOCKS; + break; + case Opt_nr_blocks: + ctx->blocks = memparse(param->string, &rest); + if (*rest || ctx->blocks > S64_MAX) + goto bad_value; + ctx->seen |= SHMEM_SEEN_BLOCKS; + break; + case Opt_nr_inodes: + ctx->inodes = memparse(param->string, &rest); + if (*rest) + goto bad_value; + ctx->seen |= SHMEM_SEEN_INODES; + break; + case Opt_mode: + ctx->mode = result.uint_32 & 07777; + break; + case Opt_uid: + kuid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(kuid)) + goto bad_value; + + /* + * The requested uid must be representable in the + * filesystem's idmapping. 
+ */ + if (!kuid_has_mapping(fc->user_ns, kuid)) + goto bad_value; + + ctx->uid = kuid; + break; + case Opt_gid: + kgid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(kgid)) + goto bad_value; + + /* + * The requested gid must be representable in the + * filesystem's idmapping. 
+ */ + if (!kgid_has_mapping(fc->user_ns, kgid)) + goto bad_value; + + ctx->gid = kgid; + break; + case Opt_huge: + ctx->huge = result.uint_32; + if (ctx->huge != SHMEM_HUGE_NEVER && + !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + has_transparent_hugepage())) + goto unsupported_parameter; + ctx->seen |= SHMEM_SEEN_HUGE; + break; + case Opt_mpol: + if (IS_ENABLED(CONFIG_NUMA)) { + mpol_put(ctx->mpol); /* drop a policy left by an earlier mpol= option */ + ctx->mpol = NULL; + if (mpol_parse_str(param->string, &ctx->mpol)) + goto bad_value; + break; + } + goto unsupported_parameter; + case Opt_inode32: + ctx->full_inums = false; + ctx->seen |= SHMEM_SEEN_INUMS; + break; + case Opt_inode64: + if (sizeof(ino_t) < 8) { + return invalfc(fc, + "Cannot use inode64 with <64bit inums in kernel\n"); + } + ctx->full_inums = true; + ctx->seen |= SHMEM_SEEN_INUMS; + break; + } + return 0; + +unsupported_parameter: + return invalfc(fc, "Unsupported parameter '%s'", param->key); +bad_value: + return invalfc(fc, "Bad value for '%s'", param->key); +} +/* + * ->parse_monolithic: hand LSM options to security_sb_eat_lsm_opts(), then + * split the remaining string at commas and pass each "key[=value]" to + * vfs_parse_fs_string(). A comma followed by a digit is not treated as a + * separator (see the comment in the loop). Note that @data is modified in + * place. + */ +static int shmem_parse_options(struct fs_context *fc, void *data) +{ + char *options = data; + + if (options) { + int err = security_sb_eat_lsm_opts(options, &fc->security); + if (err) + return err; + } + + while (options != NULL) { + char *this_char = options; + for (;;) { + /* + * NUL-terminate this option: unfortunately, + * mount options form a comma-separated list, + * but mpol's nodelist may also contain commas. + */ + options = strchr(options, ','); + if (options == NULL) + break; + options++; + if (!isdigit(*options)) { + options[-1] = '\0'; + break; + } + } + if (*this_char) { + char *value = strchr(this_char, '='); + size_t len = 0; + int err; + + if (value) { + *value++ = '\0'; + len = strlen(value); + } + err = vfs_parse_fs_string(fc, this_char, value, len); + if (err < 0) + return err; + } + } + return 0; +} + +/* + * Reconfigure a shmem filesystem. 
+ * + * Note that we disallow change from limited->unlimited blocks/inodes while any + * are in use; but we must separately disallow unlimited->limited, because in + * that case we have no record of how much is already in use. + */ +static int shmem_reconfigure(struct fs_context *fc) +{ + struct shmem_options *ctx = fc->fs_private; + struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); + unsigned long inodes; + struct mempolicy *mpol = NULL; + const char *err; + + raw_spin_lock(&sbinfo->stat_lock); + inodes = sbinfo->max_inodes - sbinfo->free_inodes; /* inodes currently in use */ + + if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { + if (!sbinfo->max_blocks) { + err = "Cannot retroactively limit size"; + goto out; + } + if (percpu_counter_compare(&sbinfo->used_blocks, + ctx->blocks) > 0) { + err = "Too small a size for current use"; + goto out; + } + } + if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { + if (!sbinfo->max_inodes) { + err = "Cannot retroactively limit inodes"; + goto out; + } + if (ctx->inodes < inodes) { + err = "Too few inodes for current use"; + goto out; + } + } + + if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && + sbinfo->next_ino > UINT_MAX) { + err = "Current inum too high to switch to 32-bit inums"; + goto out; + } + /* all checks passed: apply only the options that were explicitly given */ + if (ctx->seen & SHMEM_SEEN_HUGE) + sbinfo->huge = ctx->huge; + if (ctx->seen & SHMEM_SEEN_INUMS) + sbinfo->full_inums = ctx->full_inums; + if (ctx->seen & SHMEM_SEEN_BLOCKS) + sbinfo->max_blocks = ctx->blocks; + if (ctx->seen & SHMEM_SEEN_INODES) { + sbinfo->max_inodes = ctx->inodes; + sbinfo->free_inodes = ctx->inodes - inodes; + } + + /* + * Preserve previous mempolicy unless mpol remount option was specified. 
+ */ + if (ctx->mpol) { + mpol = sbinfo->mpol; + sbinfo->mpol = ctx->mpol; /* transfers initial ref */ + ctx->mpol = NULL; + } + raw_spin_unlock(&sbinfo->stat_lock); + mpol_put(mpol); /* old policy, if replaced; dropped after the lock is released */ + return 0; +out: + raw_spin_unlock(&sbinfo->stat_lock); + return invalfc(fc, "%s", err); +} +/* + * ->show_options: print to @seq the mount options whose values differ from + * the defaults, plus the inode32/inode64 setting per the table below. + */ +static int shmem_show_options(struct seq_file *seq, struct dentry *root) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb); + + if (sbinfo->max_blocks != shmem_default_max_blocks()) + seq_printf(seq, ",size=%luk", + sbinfo->max_blocks << (PAGE_SHIFT - 10)); + if (sbinfo->max_inodes != shmem_default_max_inodes()) + seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); + if (sbinfo->mode != (0777 | S_ISVTX)) + seq_printf(seq, ",mode=%03ho", sbinfo->mode); + if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID)) + seq_printf(seq, ",uid=%u", + from_kuid_munged(&init_user_ns, sbinfo->uid)); + if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) + seq_printf(seq, ",gid=%u", + from_kgid_munged(&init_user_ns, sbinfo->gid)); + + /* + * Showing inode{64,32} might be useful even if it's the system default, + * since then people don't have to resort to checking both here and + * /proc/config.gz to confirm 64-bit inums were successfully applied + * (which may not even exist if IKCONFIG_PROC isn't enabled). + * + * We hide it when inode64 isn't the default and we are using 32-bit + * inodes, since that probably just means the feature isn't even under + * consideration. + * + * As such: + * + * +-----------------+-----------------+ + * | TMPFS_INODE64=y | TMPFS_INODE64=n | + * +------------------+-----------------+-----------------+ + * | full_inums=true | show | show | + * | full_inums=false | show | hide | + * +------------------+-----------------+-----------------+ + * + */ + if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) + seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 
64 : 32)); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ + if (sbinfo->huge) + seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); +#endif + shmem_show_mpol(seq, sbinfo->mpol); + return 0; +} + +#endif /* CONFIG_TMPFS */ +/* + * Release everything hanging off sb->s_fs_info and clear the pointer. + * Also used as the error unwind of shmem_fill_super(). + */ +static void shmem_put_super(struct super_block *sb) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + free_percpu(sbinfo->ino_batch); + percpu_counter_destroy(&sbinfo->used_blocks); + mpol_put(sbinfo->mpol); + kfree(sbinfo); + sb->s_fs_info = NULL; +} +/* + * Set up a new superblock from the parsed options in fc->fs_private and + * create its root directory. Returns 0 on success; every failure is + * unwound through shmem_put_super() and reported as -ENOMEM. + */ +static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct shmem_options *ctx = fc->fs_private; + struct inode *inode; + struct shmem_sb_info *sbinfo; + + /* Round up to L1_CACHE_BYTES to resist false sharing */ + sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), + L1_CACHE_BYTES), GFP_KERNEL); + if (!sbinfo) + return -ENOMEM; + + sb->s_fs_info = sbinfo; + +#ifdef CONFIG_TMPFS + /* + * Per default we only allow half of the physical ram per + * tmpfs instance, limiting inodes to one per page of lowmem; + * but the internal instance is left unlimited. 
+ */ + if (!(sb->s_flags & SB_KERNMOUNT)) { + if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) + ctx->blocks = shmem_default_max_blocks(); + if (!(ctx->seen & SHMEM_SEEN_INODES)) + ctx->inodes = shmem_default_max_inodes(); + if (!(ctx->seen & SHMEM_SEEN_INUMS)) + ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); + } else { + sb->s_flags |= SB_NOUSER; + } + sb->s_export_op = &shmem_export_ops; + sb->s_flags |= SB_NOSEC | SB_I_VERSION; +#else + sb->s_flags |= SB_NOUSER; +#endif + sbinfo->max_blocks = ctx->blocks; + sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; + if (sb->s_flags & SB_KERNMOUNT) { + sbinfo->ino_batch = alloc_percpu(ino_t); + if (!sbinfo->ino_batch) + goto failed; + } + sbinfo->uid = ctx->uid; + sbinfo->gid = ctx->gid; + sbinfo->full_inums = ctx->full_inums; + sbinfo->mode = ctx->mode; + sbinfo->huge = ctx->huge; + sbinfo->mpol = ctx->mpol; + ctx->mpol = NULL; /* the policy reference now belongs to sbinfo */ + + raw_spin_lock_init(&sbinfo->stat_lock); + if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) + goto failed; + spin_lock_init(&sbinfo->shrinklist_lock); + INIT_LIST_HEAD(&sbinfo->shrinklist); + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + sb->s_magic = TMPFS_MAGIC; + sb->s_op = &shmem_ops; + sb->s_time_gran = 1; +#ifdef CONFIG_TMPFS_XATTR + sb->s_xattr = shmem_xattr_handlers; +#endif +#ifdef CONFIG_TMPFS_POSIX_ACL + sb->s_flags |= SB_POSIXACL; +#endif + uuid_gen(&sb->s_uuid); + + inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); + if (!inode) + goto failed; + inode->i_uid = sbinfo->uid; + inode->i_gid = sbinfo->gid; + sb->s_root = d_make_root(inode); + if (!sb->s_root) + goto failed; + return 0; + +failed: + shmem_put_super(sb); + return -ENOMEM; +} +/* ->get_tree: build the superblock via get_tree_nodev() and shmem_fill_super(). */ +static int shmem_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, shmem_fill_super); +} +/* + * ->free: release the option context, including a mempolicy reference that + * was not handed over to a superblock (ctx->mpol is NULL once transferred). + */ +static void shmem_free_fc(struct fs_context *fc) +{ + struct shmem_options *ctx = fc->fs_private; + + if (ctx) { + mpol_put(ctx->mpol); + 
kfree(ctx); + } +} + +static const struct fs_context_operations shmem_fs_context_ops = { + .free = shmem_free_fc, + .get_tree = shmem_get_tree, +#ifdef CONFIG_TMPFS + .parse_monolithic = shmem_parse_options, + .parse_param = shmem_parse_one, + .reconfigure = shmem_reconfigure, +#endif +}; +/* Slab cache for struct shmem_inode_info, created by shmem_init_inodecache(). */ +static struct kmem_cache *shmem_inode_cachep; +/* ->alloc_inode: allocate a shmem_inode_info and return its embedded VFS inode. */ +static struct inode *shmem_alloc_inode(struct super_block *sb) +{ + struct shmem_inode_info *info; + info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); + if (!info) + return NULL; + return &info->vfs_inode; +} +/* ->free_inode: free a symlink's i_link, then return the inode to the cache. */ +static void shmem_free_in_core_inode(struct inode *inode) +{ + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); + kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); +} +/* ->destroy_inode: regular files release their shared mempolicy. */ +static void shmem_destroy_inode(struct inode *inode) +{ + if (S_ISREG(inode->i_mode)) + mpol_free_shared_policy(&SHMEM_I(inode)->policy); +} +/* Constructor passed to kmem_cache_create() for shmem_inode_cachep. */ +static void shmem_init_inode(void *foo) +{ + struct shmem_inode_info *info = foo; + inode_init_once(&info->vfs_inode); +} + +static void shmem_init_inodecache(void) +{ + shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", + sizeof(struct shmem_inode_info), + 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); +} + +static void shmem_destroy_inodecache(void) +{ + kmem_cache_destroy(shmem_inode_cachep); +} + +/* Keep the page in page cache instead of truncating it */ +static int shmem_error_remove_page(struct address_space *mapping, + struct page *page) +{ + return 0; +} + +const struct address_space_operations shmem_aops = { + .writepage = shmem_writepage, + .dirty_folio = noop_dirty_folio, +#ifdef CONFIG_TMPFS + .write_begin = shmem_write_begin, + .write_end = shmem_write_end, +#endif +#ifdef CONFIG_MIGRATION + .migrate_folio = migrate_folio, +#endif + .error_remove_page = shmem_error_remove_page, +}; +EXPORT_SYMBOL(shmem_aops); + +static const struct file_operations shmem_file_operations = { + .mmap = shmem_mmap, + .get_unmapped_area = shmem_get_unmapped_area, +#ifdef CONFIG_TMPFS + .llseek = 
shmem_file_llseek, + .read_iter = shmem_file_read_iter, + .write_iter = generic_file_write_iter, + .fsync = noop_fsync, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .fallocate = shmem_fallocate, +#endif +}; +/* Inode operations for regular files. */ +static const struct inode_operations shmem_inode_operations = { + .getattr = shmem_getattr, + .setattr = shmem_setattr, +#ifdef CONFIG_TMPFS_XATTR + .listxattr = shmem_listxattr, + .set_acl = simple_set_acl, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, +#endif +}; +/* Inode operations for directories; namespace operations exist only with CONFIG_TMPFS. */ +static const struct inode_operations shmem_dir_inode_operations = { +#ifdef CONFIG_TMPFS + .getattr = shmem_getattr, + .create = shmem_create, + .lookup = simple_lookup, + .link = shmem_link, + .unlink = shmem_unlink, + .symlink = shmem_symlink, + .mkdir = shmem_mkdir, + .rmdir = shmem_rmdir, + .mknod = shmem_mknod, + .rename = shmem_rename2, + .tmpfile = shmem_tmpfile, +#endif +#ifdef CONFIG_TMPFS_XATTR + .listxattr = shmem_listxattr, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, +#endif +#ifdef CONFIG_TMPFS_POSIX_ACL + .setattr = shmem_setattr, + .set_acl = simple_set_acl, +#endif +}; + +static const struct inode_operations shmem_special_inode_operations = { + .getattr = shmem_getattr, +#ifdef CONFIG_TMPFS_XATTR + .listxattr = shmem_listxattr, +#endif +#ifdef CONFIG_TMPFS_POSIX_ACL + .setattr = shmem_setattr, + .set_acl = simple_set_acl, +#endif +}; + +static const struct super_operations shmem_ops = { + .alloc_inode = shmem_alloc_inode, + .free_inode = shmem_free_in_core_inode, + .destroy_inode = shmem_destroy_inode, +#ifdef CONFIG_TMPFS + .statfs = shmem_statfs, + .show_options = shmem_show_options, +#endif + .evict_inode = shmem_evict_inode, + .drop_inode = generic_delete_inode, + .put_super = shmem_put_super, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + .nr_cached_objects = shmem_unused_huge_count, + .free_cached_objects = shmem_unused_huge_scan, +#endif +}; + +static const struct 
vm_operations_struct shmem_vm_ops = { + .fault = shmem_fault, + .map_pages = filemap_map_pages, +#ifdef CONFIG_NUMA + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, +#endif +}; +/* + * ->init_fs_context: allocate the option context with its defaults (mode + * 0777 plus the sticky bit, owner taken from the caller's fsuid/fsgid) and + * install the shmem fs_context operations. Returns 0 or -ENOMEM. + */ +int shmem_init_fs_context(struct fs_context *fc) +{ + struct shmem_options *ctx; + + ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->mode = 0777 | S_ISVTX; + ctx->uid = current_fsuid(); + ctx->gid = current_fsgid(); + + fc->fs_private = ctx; + fc->ops = &shmem_fs_context_ops; + return 0; +} + +static struct file_system_type shmem_fs_type = { + .owner = THIS_MODULE, + .name = "tmpfs", + .init_fs_context = shmem_init_fs_context, +#ifdef CONFIG_TMPFS + .parameters = shmem_fs_parameters, +#endif + .kill_sb = kill_litter_super, + .fs_flags = FS_USERNS_MOUNT, +}; +/* + * Boot-time initialisation: create the inode cache, register the "tmpfs" + * filesystem type and create the internal kernel mount shm_mnt. On failure + * the completed steps are undone and shm_mnt is set to ERR_PTR(error). + */ +void __init shmem_init(void) +{ + int error; + + shmem_init_inodecache(); + + error = register_filesystem(&shmem_fs_type); + if (error) { + pr_err("Could not register tmpfs\n"); + goto out2; + } + + shm_mnt = kern_mount(&shmem_fs_type); + if (IS_ERR(shm_mnt)) { + error = PTR_ERR(shm_mnt); + pr_err("Could not kern_mount tmpfs\n"); + goto out1; + } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) + SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; + else + shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ +#endif + return; + +out1: + unregister_filesystem(&shmem_fs_type); +out2: + shmem_destroy_inodecache(); + shm_mnt = ERR_PTR(error); +} + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) +static ssize_t shmem_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + static const int values[] = { + SHMEM_HUGE_ALWAYS, + SHMEM_HUGE_WITHIN_SIZE, + SHMEM_HUGE_ADVISE, + SHMEM_HUGE_NEVER, + SHMEM_HUGE_DENY, + SHMEM_HUGE_FORCE, + }; + int len = 0; + int i; + /* list every value separated by spaces, with the current one in [brackets] */ + for (i = 0; i < ARRAY_SIZE(values); i++) { + len += sysfs_emit_at(buf, len, + shmem_huge == values[i] ? 
"%s[%s]" : "%s%s", + i ? " " : "", + shmem_format_huge(values[i])); + } + + len += sysfs_emit_at(buf, len, "\n"); + + return len; +} +/* + * sysfs store for shmem_enabled: copy the input into a 16-byte buffer + * (longer input is -EINVAL), strip one trailing newline and parse it with + * shmem_parse_huge(). Values other than SHMEM_HUGE_NEVER and + * SHMEM_HUGE_DENY are refused when transparent hugepages are unavailable. + * Values above SHMEM_HUGE_DENY are also applied to the internal mount. + */ +static ssize_t shmem_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + char tmp[16]; + int huge; + + if (count + 1 > sizeof(tmp)) + return -EINVAL; + memcpy(tmp, buf, count); + tmp[count] = '\0'; + if (count && tmp[count - 1] == '\n') + tmp[count - 1] = '\0'; + + huge = shmem_parse_huge(tmp); + if (huge == -EINVAL) + return -EINVAL; + if (!has_transparent_hugepage() && + huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) + return -EINVAL; + + shmem_huge = huge; + if (shmem_huge > SHMEM_HUGE_DENY) + SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; + return count; +} + +struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ + +#else /* !CONFIG_SHMEM */ + +/* + * tiny-shmem: simple shmemfs and tmpfs using ramfs code + * + * This is intended for small system where the benefits of the full + * shmem code (swap-backed and resource-limited) are outweighed by + * their complexity. On systems without swap this code should be + * effectively equivalent, but much lighter weight. 
+ */ + +static struct file_system_type shmem_fs_type = { + .name = "tmpfs", + .init_fs_context = ramfs_init_fs_context, + .parameters = ramfs_fs_parameters, + .kill_sb = ramfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, +}; +/* !CONFIG_SHMEM: register the ramfs-backed "tmpfs" and create shm_mnt; any failure is a BUG. */ +void __init shmem_init(void) +{ + BUG_ON(register_filesystem(&shmem_fs_type) != 0); + + shm_mnt = kern_mount(&shmem_fs_type); + BUG_ON(IS_ERR(shm_mnt)); +} +/* !CONFIG_SHMEM stub: does nothing and returns 0. */ +int shmem_unuse(unsigned int type) +{ + return 0; +} +/* !CONFIG_SHMEM stub: does nothing and returns 0. */ +int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) +{ + return 0; +} +/* !CONFIG_SHMEM stub: nothing to do. */ +void shmem_unlock_mapping(struct address_space *mapping) +{ +} + +#ifdef CONFIG_MMU +unsigned long shmem_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); +} +#endif +/* !CONFIG_SHMEM: plain page-cache truncation of the byte range [lstart, lend]. */ +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + truncate_inode_pages_range(inode->i_mapping, lstart, lend); +} +EXPORT_SYMBOL_GPL(shmem_truncate_range); + +#define shmem_vm_ops generic_file_vm_ops +#define shmem_file_operations ramfs_file_operations +#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) +#define shmem_acct_size(flags, size) 0 +#define shmem_unacct_size(flags, size) do {} while (0) + +#endif /* CONFIG_SHMEM */ + +/* common code */ +/* + * Create an unlinked file of @size bytes on the tmpfs mount @mnt. + * @flags is passed to size accounting and to inode allocation; @i_flags is + * OR-ed into inode->i_flags. Returns the new file or an ERR_PTR: the error + * carried by @mnt, -EINVAL for a size outside [0, MAX_LFS_FILESIZE], + * -ENOMEM if accounting fails, -ENOSPC if no inode is available, or the + * error from the later setup steps. + */ +static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, + unsigned long flags, unsigned int i_flags) +{ + struct inode *inode; + struct file *res; + + if (IS_ERR(mnt)) + return ERR_CAST(mnt); + + if (size < 0 || size > MAX_LFS_FILESIZE) + return ERR_PTR(-EINVAL); + + if (shmem_acct_size(flags, size)) + return ERR_PTR(-ENOMEM); + + inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, + flags); + if (unlikely(!inode)) { + shmem_unacct_size(flags, size); + return ERR_PTR(-ENOSPC); + } + inode->i_flags |= i_flags; + inode->i_size = size; + clear_nlink(inode); /* It is unlinked */ + 
res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); + if (!IS_ERR(res)) + res = alloc_file_pseudo(inode, mnt, name, O_RDWR, + &shmem_file_operations); + if (IS_ERR(res)) + iput(inode); + return res; +} + +/** + * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be + * kernel internal. There will be NO LSM permission checks against the + * underlying inode. So users of this interface must do LSM checks at a + * higher layer. The users are the big_key and shm implementations. LSM + * checks are provided at the key or shm level rather than the inode. + * @name: name for dentry (to be seen in /proc/<pid>/maps) + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); +} + +/** + * shmem_file_setup - get an unlinked file living in tmpfs + * @name: name for dentry (to be seen in /proc/<pid>/maps) + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) +{ + return __shmem_file_setup(shm_mnt, name, size, flags, 0); +} +EXPORT_SYMBOL_GPL(shmem_file_setup); + +/** + * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs + * @mnt: the tmpfs mount where the file will be created + * @name: name for dentry (to be seen in /proc/<pid>/maps) + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size + */ +struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, + loff_t size, unsigned long flags) +{ + return __shmem_file_setup(mnt, name, size, flags, 0); +} +EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); + +/** + * shmem_zero_setup - setup a shared anonymous mapping + * @vma: the vma to be mmapped is prepared by 
do_mmap + */ +int shmem_zero_setup(struct vm_area_struct *vma) +{ + struct file *file; + loff_t size = vma->vm_end - vma->vm_start; + + /* + * Cloning a new file under mmap_lock leads to a lock ordering conflict + * between XFS directory reading and selinux: since this file is only + * accessible to the user through its mapping, use S_PRIVATE flag to + * bypass file security, in the same way as shmem_kernel_file_setup(). + */ + file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (vma->vm_file) + fput(vma->vm_file); + vma->vm_file = file; + vma->vm_ops = &shmem_vm_ops; + + return 0; +} + +/** + * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. + * @mapping: the page's address_space + * @index: the page index + * @gfp: the page allocator flags to use if allocating + * + * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", + * with any new page allocations done using the specified allocation flags. + * But read_cache_page_gfp() uses the ->read_folio() method: which does not + * suit tmpfs, since it may have pages in swapcache, and needs to find those + * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. + * + * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in + * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. 
+ */ +struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp) +{ +#ifdef CONFIG_SHMEM + struct inode *inode = mapping->host; + struct folio *folio; + struct page *page; + int error; + + BUG_ON(!shmem_mapping(mapping)); + error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, + gfp, NULL, NULL, NULL); + if (error) + return ERR_PTR(error); + + folio_unlock(folio); + page = folio_file_page(folio, index); + if (PageHWPoison(page)) { + folio_put(folio); + return ERR_PTR(-EIO); + } + + return page; +#else + /* + * The tiny !SHMEM case uses ramfs without swap + */ + return read_cache_page_gfp(mapping, index, gfp); +#endif +} +EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c new file mode 100644 index 000000000..39c3491e2 --- /dev/null +++ b/mm/shrinker_debug.c @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include + +/* defined in vmscan.c */ +extern struct rw_semaphore shrinker_rwsem; +extern struct list_head shrinker_list; + +static DEFINE_IDA(shrinker_debugfs_ida); +static struct dentry *shrinker_debugfs_root; + +static unsigned long shrinker_count_objects(struct shrinker *shrinker, + struct mem_cgroup *memcg, + unsigned long *count_per_node) +{ + unsigned long nr, total = 0; + int nid; + + for_each_node(nid) { + if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) { + struct shrink_control sc = { + .gfp_mask = GFP_KERNEL, + .nid = nid, + .memcg = memcg, + }; + + nr = shrinker->count_objects(shrinker, &sc); + if (nr == SHRINK_EMPTY) + nr = 0; + } else { + nr = 0; + } + + count_per_node[nid] = nr; + total += nr; + } + + return total; +} + +static int shrinker_debugfs_count_show(struct seq_file *m, void *v) +{ + struct shrinker *shrinker = m->private; + unsigned long *count_per_node; + struct mem_cgroup *memcg; + unsigned long total; + bool memcg_aware; + int ret, nid; + + count_per_node = 
kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
+	if (!count_per_node)
+		return -ENOMEM;
+
+	ret = down_read_killable(&shrinker_rwsem);
+	if (ret) {
+		kfree(count_per_node);
+		return ret;
+	}
+	rcu_read_lock();
+
+	memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
+
+	/*
+	 * Walk the memcg hierarchy.  One output line is emitted per memcg
+	 * whose total is non-zero: "<memcg ino> <count node 0> <count node 1> ...".
+	 * A shrinker that is not memcg aware is counted once with a NULL
+	 * memcg and the walk stops after the first iteration.
+	 */
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		/* 'continue' jumps to the while condition, i.e. the next memcg */
+		if (memcg && !mem_cgroup_online(memcg))
+			continue;
+
+		total = shrinker_count_objects(shrinker,
+					       memcg_aware ? memcg : NULL,
+					       count_per_node);
+		if (total) {
+			seq_printf(m, "%lu", mem_cgroup_ino(memcg));
+			for_each_node(nid)
+				seq_printf(m, " %lu", count_per_node[nid]);
+			seq_putc(m, '\n');
+		}
+
+		if (!memcg_aware) {
+			mem_cgroup_iter_break(NULL, memcg);
+			break;
+		}
+
+		/* give up with -EINTR once a signal is pending */
+		if (signal_pending(current)) {
+			mem_cgroup_iter_break(NULL, memcg);
+			ret = -EINTR;
+			break;
+		}
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+
+	rcu_read_unlock();
+	up_read(&shrinker_rwsem);
+
+	kfree(count_per_node);
+	return ret;
+}
+DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
+
+/* Hand the shrinker stored in the inode to the write handler. */
+static int shrinker_debugfs_scan_open(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return nonseekable_open(inode, file);
+}
+
+/*
+ * Write handler of the "scan" file.  The input is parsed as
+ * "<memcg ino> <nid> <nr_to_scan>" and triggers one scan_objects() call.
+ */
+static ssize_t shrinker_debugfs_scan_write(struct file *file,
+					   const char __user *buf,
+					   size_t size, loff_t *pos)
+{
+	struct shrinker *shrinker = file->private_data;
+	unsigned long nr_to_scan = 0, ino, read_len;
+	struct shrink_control sc = {
+		.gfp_mask = GFP_KERNEL,
+	};
+	struct mem_cgroup *memcg = NULL;
+	int nid;
+	char kbuf[72];
+	ssize_t ret;
+
+	/* copy at most sizeof(kbuf) - 1 bytes so a terminating NUL fits */
+	read_len = size < (sizeof(kbuf) - 1) ?
size : (sizeof(kbuf) - 1); + if (copy_from_user(kbuf, buf, read_len)) + return -EFAULT; + kbuf[read_len] = '\0'; + + if (sscanf(kbuf, "%lu %d %lu", &ino, &nid, &nr_to_scan) != 3) + return -EINVAL; + + if (nid < 0 || nid >= nr_node_ids) + return -EINVAL; + + if (nr_to_scan == 0) + return size; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + memcg = mem_cgroup_get_from_ino(ino); + if (!memcg || IS_ERR(memcg)) + return -ENOENT; + + if (!mem_cgroup_online(memcg)) { + mem_cgroup_put(memcg); + return -ENOENT; + } + } else if (ino != 0) { + return -EINVAL; + } + + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + mem_cgroup_put(memcg); + return ret; + } + + sc.nid = nid; + sc.memcg = memcg; + sc.nr_to_scan = nr_to_scan; + sc.nr_scanned = nr_to_scan; + + shrinker->scan_objects(shrinker, &sc); + + up_read(&shrinker_rwsem); + mem_cgroup_put(memcg); + + return size; +} + +static const struct file_operations shrinker_debugfs_scan_fops = { + .owner = THIS_MODULE, + .open = shrinker_debugfs_scan_open, + .write = shrinker_debugfs_scan_write, +}; + +int shrinker_debugfs_add(struct shrinker *shrinker) +{ + struct dentry *entry; + char buf[128]; + int id; + + lockdep_assert_held(&shrinker_rwsem); + + /* debugfs isn't initialized yet, add debugfs entries later. */ + if (!shrinker_debugfs_root) + return 0; + + id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL); + if (id < 0) + return id; + shrinker->debugfs_id = id; + + snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, id); + + /* create debugfs entry */ + entry = debugfs_create_dir(buf, shrinker_debugfs_root); + if (IS_ERR(entry)) { + ida_free(&shrinker_debugfs_ida, id); + return PTR_ERR(entry); + } + shrinker->debugfs_entry = entry; + + debugfs_create_file("count", 0220, entry, shrinker, + &shrinker_debugfs_count_fops); + debugfs_create_file("scan", 0440, entry, shrinker, + &shrinker_debugfs_scan_fops); + return 0; +} + +int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) 
+{
+	struct dentry *entry;
+	char buf[128];
+	const char *new, *old;
+	va_list ap;
+	int ret = 0;
+
+	va_start(ap, fmt);
+	new = kvasprintf_const(GFP_KERNEL, fmt, ap);
+	va_end(ap);
+
+	if (!new)
+		return -ENOMEM;
+
+	down_write(&shrinker_rwsem);
+
+	/* the new name is installed even if the debugfs rename below fails */
+	old = shrinker->name;
+	shrinker->name = new;
+
+	if (shrinker->debugfs_entry) {
+		snprintf(buf, sizeof(buf), "%s-%d", shrinker->name,
+			 shrinker->debugfs_id);
+
+		entry = debugfs_rename(shrinker_debugfs_root,
+				       shrinker->debugfs_entry,
+				       shrinker_debugfs_root, buf);
+		if (IS_ERR(entry))
+			ret = PTR_ERR(entry);
+		else
+			shrinker->debugfs_entry = entry;
+	}
+
+	up_write(&shrinker_rwsem);
+
+	/* the previous name is released outside the lock */
+	kfree_const(old);
+
+	return ret;
+}
+EXPORT_SYMBOL(shrinker_debugfs_rename);
+
+/*
+ * Detach the debugfs state from @shrinker: free its name and, if a debugfs
+ * directory exists, release its id and clear shrinker->debugfs_entry.
+ * The caller must hold shrinker_rwsem.
+ *
+ * Returns the former debugfs directory (possibly NULL); it is not removed
+ * here.  NOTE(review): presumably the caller removes it after dropping the
+ * lock - confirm against the caller.
+ */
+struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker)
+{
+	struct dentry *entry = shrinker->debugfs_entry;
+
+	lockdep_assert_held(&shrinker_rwsem);
+
+	kfree_const(shrinker->name);
+	shrinker->name = NULL;
+
+	if (entry) {
+		ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
+		shrinker->debugfs_entry = NULL;
+	}
+
+	return entry;
+}
+
+/*
+ * Create the "shrinker" debugfs root and add entries for every shrinker on
+ * shrinker_list that has none yet.  Stops at the first failure and returns
+ * its error code.
+ */
+static int __init shrinker_debugfs_init(void)
+{
+	struct shrinker *shrinker;
+	struct dentry *dentry;
+	int ret = 0;
+
+	dentry = debugfs_create_dir("shrinker", NULL);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	shrinker_debugfs_root = dentry;
+
+	/* Create debugfs entries for shrinkers registered at boot */
+	down_write(&shrinker_rwsem);
+	list_for_each_entry(shrinker, &shrinker_list, list)
+		if (!shrinker->debugfs_entry) {
+			ret = shrinker_debugfs_add(shrinker);
+			if (ret)
+				break;
+		}
+	up_write(&shrinker_rwsem);
+
+	return ret;
+}
+late_initcall(shrinker_debugfs_init);
diff --git a/mm/shuffle.c b/mm/shuffle.c
new file mode 100644
index 000000000..fb1393b8b
--- /dev/null
+++ b/mm/shuffle.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+ +#include +#include +#include +#include +#include +#include "internal.h" +#include "shuffle.h" + +DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key); + +static bool shuffle_param; + +static __meminit int shuffle_param_set(const char *val, + const struct kernel_param *kp) +{ + if (param_set_bool(val, kp)) + return -EINVAL; + if (*(bool *)kp->arg) + static_branch_enable(&page_alloc_shuffle_key); + return 0; +} + +static const struct kernel_param_ops shuffle_param_ops = { + .set = shuffle_param_set, + .get = param_get_bool, +}; +module_param_cb(shuffle, &shuffle_param_ops, &shuffle_param, 0400); + +/* + * For two pages to be swapped in the shuffle, they must be free (on a + * 'free_area' lru), have the same order, and have the same migratetype. + */ +static struct page * __meminit shuffle_valid_page(struct zone *zone, + unsigned long pfn, int order) +{ + struct page *page = pfn_to_online_page(pfn); + + /* + * Given we're dealing with randomly selected pfns in a zone we + * need to ask questions like... + */ + + /* ... is the page managed by the buddy? */ + if (!page) + return NULL; + + /* ... is the page assigned to the same zone? */ + if (page_zone(page) != zone) + return NULL; + + /* ...is the page free and currently on a free_area list? */ + if (!PageBuddy(page)) + return NULL; + + /* + * ...is the page on the same list as the page we will + * shuffle it with? + */ + if (buddy_order(page) != order) + return NULL; + + return page; +} + +/* + * Fisher-Yates shuffle the freelist which prescribes iterating through an + * array, pfns in this case, and randomly swapping each entry with another in + * the span, end_pfn - start_pfn. + * + * To keep the implementation simple it does not attempt to correct for sources + * of bias in the distribution, like modulo bias or pseudo-random number + * generator bias. I.e. the expectation is that this shuffling raises the bar + * for attacks that exploit the predictability of page allocations, but need not + * be a perfect shuffle. 
+ */
+#define SHUFFLE_RETRY	10
+void __meminit __shuffle_zone(struct zone *z)
+{
+	unsigned long i, flags;
+	unsigned long start_pfn = z->zone_start_pfn;
+	unsigned long end_pfn = zone_end_pfn(z);
+	const int order = SHUFFLE_ORDER;
+	const int order_pages = 1 << order;
+
+	spin_lock_irqsave(&z->lock, flags);
+	start_pfn = ALIGN(start_pfn, order_pages);
+	for (i = start_pfn; i < end_pfn; i += order_pages) {
+		unsigned long j;
+		int migratetype, retry;
+		struct page *page_i, *page_j;
+
+		/*
+		 * We expect page_i, in the sub-range of a zone being added
+		 * (@start_pfn to @end_pfn), to more likely be valid compared to
+		 * page_j randomly selected in the span @zone_start_pfn to
+		 * @spanned_pages.
+		 */
+		page_i = shuffle_valid_page(z, i, order);
+		if (!page_i)
+			continue;
+
+		for (retry = 0; retry < SHUFFLE_RETRY; retry++) {
+			/*
+			 * Pick a random order aligned page in the zone span as
+			 * a swap target. If the selected pfn is a hole, retry
+			 * up to SHUFFLE_RETRY attempts to find a random valid
+			 * pfn in the zone.
+			 */
+			j = z->zone_start_pfn +
+				ALIGN_DOWN(get_random_long() % z->spanned_pages,
+						order_pages);
+			page_j = shuffle_valid_page(z, j, order);
+			if (page_j && page_j != page_i)
+				break;
+		}
+		if (retry >= SHUFFLE_RETRY) {
+			pr_debug("%s: failed to swap %#lx\n", __func__, i);
+			continue;
+		}
+
+		/*
+		 * Each migratetype corresponds to its own list, make sure the
+		 * types match otherwise we're moving pages to lists where they
+		 * do not belong.
+		 */
+		migratetype = get_pageblock_migratetype(page_i);
+		if (get_pageblock_migratetype(page_j) != migratetype) {
+			pr_debug("%s: migratetype mismatch %#lx\n", __func__, i);
+			continue;
+		}
+
+		list_swap(&page_i->lru, &page_j->lru);
+
+		pr_debug("%s: swap: %#lx -> %#lx\n", __func__, i, j);
+
+		/*
+		 * take it easy on the zone lock
+		 *
+		 * This is only reached after a successful swap: every
+		 * 'continue' above skips it.
+		 */
+		if ((i % (100 * order_pages)) == 0) {
+			spin_unlock_irqrestore(&z->lock, flags);
+			cond_resched();
+			spin_lock_irqsave(&z->lock, flags);
+		}
+	}
+	spin_unlock_irqrestore(&z->lock, flags);
+}
+
+/*
+ * __shuffle_free_memory - reduce the predictability of the page allocator
+ * @pgdat: node page data
+ *
+ * Calls shuffle_zone() on each of the MAX_NR_ZONES entries of
+ * @pgdat->node_zones.
+ */
+void __meminit __shuffle_free_memory(pg_data_t *pgdat)
+{
+	struct zone *z;
+
+	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+		shuffle_zone(z);
+}
+
+/*
+ * Return one random bit.  64 bits are fetched with get_random_u64() and
+ * handed out one at a time, lowest bit first, before fetching again.
+ */
+bool shuffle_pick_tail(void)
+{
+	static u64 rand;
+	static u8 rand_bits;
+	bool ret;
+
+	/*
+	 * The lack of locking is deliberate. If 2 threads race to
+	 * update the rand state it just adds to the entropy.
+	 */
+	if (rand_bits == 0) {
+		rand_bits = 64;
+		rand = get_random_u64();
+	}
+
+	ret = rand & 1;
+
+	rand_bits--;
+	rand >>= 1;
+
+	return ret;
+}
diff --git a/mm/shuffle.h b/mm/shuffle.h
new file mode 100644
index 000000000..cec62984f
--- /dev/null
+++ b/mm/shuffle.h
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+#ifndef _MM_SHUFFLE_H +#define _MM_SHUFFLE_H +#include + +#define SHUFFLE_ORDER (MAX_ORDER-1) + +#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR +DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); +extern void __shuffle_free_memory(pg_data_t *pgdat); +extern bool shuffle_pick_tail(void); +static inline void __meminit shuffle_free_memory(pg_data_t *pgdat) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return; + __shuffle_free_memory(pgdat); +} + +extern void __shuffle_zone(struct zone *z); +static inline void __meminit shuffle_zone(struct zone *z) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return; + __shuffle_zone(z); +} + +static inline bool is_shuffle_order(int order) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return false; + return order >= SHUFFLE_ORDER; +} +#else +static inline bool shuffle_pick_tail(void) +{ + return false; +} + +static inline void shuffle_free_memory(pg_data_t *pgdat) +{ +} + +static inline void shuffle_zone(struct zone *z) +{ +} + +static inline bool is_shuffle_order(int order) +{ + return false; +} +#endif +#endif /* _MM_SHUFFLE_H */ diff --git a/mm/slab.c b/mm/slab.c new file mode 100644 index 000000000..62869bc3c --- /dev/null +++ b/mm/slab.c @@ -0,0 +1,4053 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/mm/slab.c + * Written by Mark Hemment, 1996/97. + * (markhe@nextd.demon.co.uk) + * + * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli + * + * Major cleanup, different bufctl logic, per-cpu arrays + * (c) 2000 Manfred Spraul + * + * Cleanup, make the head arrays unconditional, preparation for NUMA + * (c) 2002 Manfred Spraul + * + * An implementation of the Slab Allocator as described in outline in; + * UNIX Internals: The New Frontiers by Uresh Vahalia + * Pub: Prentice Hall ISBN 0-13-101908-2 + * or with a little more detail in; + * The Slab Allocator: An Object-Caching Kernel Memory Allocator + * Jeff Bonwick (Sun Microsystems). 
+ * Presented at: USENIX Summer 1994 Technical Conference + * + * The memory is organized in caches, one cache for each object type. + * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) + * Each cache consists out of many slabs (they are small (usually one + * page long) and always contiguous), and each slab contains multiple + * initialized objects. + * + * This means, that your constructor is used only for newly allocated + * slabs and you must pass objects with the same initializations to + * kmem_cache_free. + * + * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, + * normal). If you need a special memory type, then must create a new + * cache for that memory type. + * + * In order to reduce fragmentation, the slabs are sorted in 3 groups: + * full slabs with 0 free objects + * partial slabs + * empty slabs with no allocated objects + * + * If partial slabs exist, then new allocations come from these slabs, + * otherwise from empty slabs or new slabs are allocated. + * + * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache + * during kmem_cache_destroy(). The caller must prevent concurrent allocs. + * + * Each cache has a short per-cpu head array, most allocs + * and frees go into that array, and if that array overflows, then 1/2 + * of the entries in the array are given back into the global cache. + * The head array is strictly LIFO and should improve the cache hit rates. + * On SMP, it additionally reduces the spinlock operations. + * + * The c_cpuarray may not be read with enabled local interrupts - + * it's changed with a smp_call_function(). + * + * SMP synchronization: + * constructors and destructors are called without any locking. + * Several members in struct kmem_cache and struct slab never change, they + * are accessed without any locking. + * The per-cpu arrays are never accessed from the wrong cpu, no locking, + * and local interrupts are disabled so slab code is preempt-safe. 
+ * The non-constant members are protected with a per-cache irq spinlock. + * + * Many thanks to Mark Hemment, who wrote another per-cpu slab patch + * in 2000 - many ideas in the current implementation are derived from + * his patch. + * + * Further notes from the original documentation: + * + * 11 April '97. Started multi-threading - markhe + * The global cache-chain is protected by the mutex 'slab_mutex'. + * The sem is only needed when accessing/extending the cache-chain, which + * can never happen inside an interrupt (kmem_cache_create(), + * kmem_cache_shrink() and kmem_cache_reap()). + * + * At present, each engine can be growing a cache. This should be blocked. + * + * 15 March 2005. NUMA slab allocator. + * Shai Fultheim . + * Shobhit Dayal + * Alok N Kataria + * Christoph Lameter + * + * Modified the slab allocator to be node aware on NUMA systems. + * Each node has its own list of partial, free and full slabs. + * All object allocations for a node occur from node specific slab lists. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include + +#include "internal.h" + +#include "slab.h" + +/* + * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. + * 0 for faster, smaller code (especially in the critical paths). + * + * STATS - 1 to collect stats for /proc/slabinfo. + * 0 for faster, smaller code (especially in the critical paths). + * + * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) + */ + +#ifdef CONFIG_DEBUG_SLAB +#define DEBUG 1 +#define STATS 1 +#define FORCED_DEBUG 1 +#else +#define DEBUG 0 +#define STATS 0 +#define FORCED_DEBUG 0 +#endif + +/* Shouldn't this be in a header file somewhere? 
*/ +#define BYTES_PER_WORD sizeof(void *) +#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) + +#ifndef ARCH_KMALLOC_FLAGS +#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN +#endif + +#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \ + <= SLAB_OBJ_MIN_SIZE) ? 1 : 0) + +#if FREELIST_BYTE_INDEX +typedef unsigned char freelist_idx_t; +#else +typedef unsigned short freelist_idx_t; +#endif + +#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) + +/* + * struct array_cache + * + * Purpose: + * - LIFO ordering, to hand out cache-warm objects from _alloc + * - reduce the number of linked list operations + * - reduce spinlock operations + * + * The limit is stored in the per-cpu structure to reduce the data cache + * footprint. + * + */ +struct array_cache { + unsigned int avail; + unsigned int limit; + unsigned int batchcount; + unsigned int touched; + void *entry[]; /* + * Must have this definition in here for the proper + * alignment of array_cache. Also simplifies accessing + * the entries. + */ +}; + +struct alien_cache { + spinlock_t lock; + struct array_cache ac; +}; + +/* + * Need this for bootstrapping a per node allocator. 
+ */ +#define NUM_INIT_LISTS (2 * MAX_NUMNODES) +static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; +#define CACHE_CACHE 0 +#define SIZE_NODE (MAX_NUMNODES) + +static int drain_freelist(struct kmem_cache *cache, + struct kmem_cache_node *n, int tofree); +static void free_block(struct kmem_cache *cachep, void **objpp, int len, + int node, struct list_head *list); +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); +static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); +static void cache_reap(struct work_struct *unused); + +static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, + void **list); +static inline void fixup_slab_list(struct kmem_cache *cachep, + struct kmem_cache_node *n, struct slab *slab, + void **list); +static int slab_early_init = 1; + +#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) + +static void kmem_cache_node_init(struct kmem_cache_node *parent) +{ + INIT_LIST_HEAD(&parent->slabs_full); + INIT_LIST_HEAD(&parent->slabs_partial); + INIT_LIST_HEAD(&parent->slabs_free); + parent->total_slabs = 0; + parent->free_slabs = 0; + parent->shared = NULL; + parent->alien = NULL; + parent->colour_next = 0; + spin_lock_init(&parent->list_lock); + parent->free_objects = 0; + parent->free_touched = 0; +} + +#define MAKE_LIST(cachep, listp, slab, nodeid) \ + do { \ + INIT_LIST_HEAD(listp); \ + list_splice(&get_node(cachep, nodeid)->slab, listp); \ + } while (0) + +#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ + do { \ + MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ + MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ + MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ + } while (0) + +#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U) +#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U) +#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) +#define OFF_SLAB(x) ((x)->flags & 
CFLGS_OFF_SLAB) + +#define BATCHREFILL_LIMIT 16 +/* + * Optimization question: fewer reaps means less probability for unnecessary + * cpucache drain/refill cycles. + * + * OTOH the cpuarrays can contain lots of objects, + * which could lock up otherwise freeable slabs. + */ +#define REAPTIMEOUT_AC (2*HZ) +#define REAPTIMEOUT_NODE (4*HZ) + +#if STATS +#define STATS_INC_ACTIVE(x) ((x)->num_active++) +#define STATS_DEC_ACTIVE(x) ((x)->num_active--) +#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) +#define STATS_INC_GROWN(x) ((x)->grown++) +#define STATS_ADD_REAPED(x, y) ((x)->reaped += (y)) +#define STATS_SET_HIGH(x) \ + do { \ + if ((x)->num_active > (x)->high_mark) \ + (x)->high_mark = (x)->num_active; \ + } while (0) +#define STATS_INC_ERR(x) ((x)->errors++) +#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) +#define STATS_INC_NODEFREES(x) ((x)->node_frees++) +#define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) +#define STATS_SET_FREEABLE(x, i) \ + do { \ + if ((x)->max_freeable < i) \ + (x)->max_freeable = i; \ + } while (0) +#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) +#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) +#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) +#define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) +#else +#define STATS_INC_ACTIVE(x) do { } while (0) +#define STATS_DEC_ACTIVE(x) do { } while (0) +#define STATS_INC_ALLOCED(x) do { } while (0) +#define STATS_INC_GROWN(x) do { } while (0) +#define STATS_ADD_REAPED(x, y) do { (void)(y); } while (0) +#define STATS_SET_HIGH(x) do { } while (0) +#define STATS_INC_ERR(x) do { } while (0) +#define STATS_INC_NODEALLOCS(x) do { } while (0) +#define STATS_INC_NODEFREES(x) do { } while (0) +#define STATS_INC_ACOVERFLOW(x) do { } while (0) +#define STATS_SET_FREEABLE(x, i) do { } while (0) +#define STATS_INC_ALLOCHIT(x) do { } while (0) +#define STATS_INC_ALLOCMISS(x) do { } while (0) +#define STATS_INC_FREEHIT(x) do { } while (0) +#define STATS_INC_FREEMISS(x) 
do { } while (0) +#endif + +#if DEBUG + +/* + * memory layout of objects: + * 0 : objp + * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that + * the end of an object is aligned with the end of the real + * allocation. Catches writes behind the end of the allocation. + * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: + * redzone word. + * cachep->obj_offset: The real object. + * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] + * cachep->size - 1* BYTES_PER_WORD: last caller address + * [BYTES_PER_WORD long] + */ +static int obj_offset(struct kmem_cache *cachep) +{ + return cachep->obj_offset; +} + +static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); + return (unsigned long long *) (objp + obj_offset(cachep) - + sizeof(unsigned long long)); +} + +static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); + if (cachep->flags & SLAB_STORE_USER) + return (unsigned long long *)(objp + cachep->size - + sizeof(unsigned long long) - + REDZONE_ALIGN); + return (unsigned long long *) (objp + cachep->size - + sizeof(unsigned long long)); +} + +static void **dbg_userword(struct kmem_cache *cachep, void *objp) +{ + BUG_ON(!(cachep->flags & SLAB_STORE_USER)); + return (void **)(objp + cachep->size - BYTES_PER_WORD); +} + +#else + +#define obj_offset(x) 0 +#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) +#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) +#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) + +#endif + +/* + * Do not go above this order unless 0 objects fit into the slab or + * overridden on the command line. 
+ */ +#define SLAB_MAX_ORDER_HI 1 +#define SLAB_MAX_ORDER_LO 0 +static int slab_max_order = SLAB_MAX_ORDER_LO; +static bool slab_max_order_set __initdata; + +static inline void *index_to_obj(struct kmem_cache *cache, + const struct slab *slab, unsigned int idx) +{ + return slab->s_mem + cache->size * idx; +} + +#define BOOT_CPUCACHE_ENTRIES 1 +/* internal cache of cache description objs */ +static struct kmem_cache kmem_cache_boot = { + .batchcount = 1, + .limit = BOOT_CPUCACHE_ENTRIES, + .shared = 1, + .size = sizeof(struct kmem_cache), + .name = "kmem_cache", +}; + +static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); + +static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) +{ + return this_cpu_ptr(cachep->cpu_cache); +} + +/* + * Calculate the number of objects and left-over bytes for a given buffer size. + */ +static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, + slab_flags_t flags, size_t *left_over) +{ + unsigned int num; + size_t slab_size = PAGE_SIZE << gfporder; + + /* + * The slab management structure can be either off the slab or + * on it. For the latter case, the memory allocated for a + * slab is used for: + * + * - @buffer_size bytes for each object + * - One freelist_idx_t for each object + * + * We don't need to consider alignment of freelist because + * freelist will be at the end of slab page. The objects will be + * at the correct alignment. + * + * If the slab management structure is off the slab, then the + * alignment will already be calculated into the size. Because + * the slabs are all pages aligned, the objects will be at the + * correct alignment when allocated. 
+ */ + if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { + num = slab_size / buffer_size; + *left_over = slab_size % buffer_size; + } else { + num = slab_size / (buffer_size + sizeof(freelist_idx_t)); + *left_over = slab_size % + (buffer_size + sizeof(freelist_idx_t)); + } + + return num; +} + +#if DEBUG +#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) + +static void __slab_error(const char *function, struct kmem_cache *cachep, + char *msg) +{ + pr_err("slab error in %s(): cache `%s': %s\n", + function, cachep->name, msg); + dump_stack(); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); +} +#endif + +/* + * By default on NUMA we use alien caches to stage the freeing of + * objects allocated from other nodes. This causes massive memory + * inefficiencies when using fake NUMA setup to split memory into a + * large number of small nodes, so it can be disabled on the command + * line + */ + +static int use_alien_caches __read_mostly = 1; +static int __init noaliencache_setup(char *s) +{ + use_alien_caches = 0; + return 1; +} +__setup("noaliencache", noaliencache_setup); + +static int __init slab_max_order_setup(char *str) +{ + get_option(&str, &slab_max_order); + slab_max_order = slab_max_order < 0 ? 0 : + min(slab_max_order, MAX_ORDER - 1); + slab_max_order_set = true; + + return 1; +} +__setup("slab_max_order=", slab_max_order_setup); + +#ifdef CONFIG_NUMA +/* + * Special reaping functions for NUMA systems called from cache_reap(). + * These take care of doing round robin flushing of alien caches (containing + * objects freed on different nodes from which they were allocated) and the + * flushing of remote pcps by calling drain_node_pages. 
+ */ +static DEFINE_PER_CPU(unsigned long, slab_reap_node); + +static void init_reap_node(int cpu) +{ + per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu), + node_online_map); +} + +static void next_reap_node(void) +{ + int node = __this_cpu_read(slab_reap_node); + + node = next_node_in(node, node_online_map); + __this_cpu_write(slab_reap_node, node); +} + +#else +#define init_reap_node(cpu) do { } while (0) +#define next_reap_node(void) do { } while (0) +#endif + +/* + * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz + * via the workqueue/eventd. + * Add the CPU number into the expiration time to minimize the possibility of + * the CPUs getting into lockstep and contending for the global cache chain + * lock. + */ +static void start_cpu_timer(int cpu) +{ + struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); + + if (reap_work->work.func == NULL) { + init_reap_node(cpu); + INIT_DEFERRABLE_WORK(reap_work, cache_reap); + schedule_delayed_work_on(cpu, reap_work, + __round_jiffies_relative(HZ, cpu)); + } +} + +static void init_arraycache(struct array_cache *ac, int limit, int batch) +{ + if (ac) { + ac->avail = 0; + ac->limit = limit; + ac->batchcount = batch; + ac->touched = 0; + } +} + +static struct array_cache *alloc_arraycache(int node, int entries, + int batchcount, gfp_t gfp) +{ + size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); + struct array_cache *ac = NULL; + + ac = kmalloc_node(memsize, gfp, node); + /* + * The array_cache structures contain pointers to free object. + * However, when such objects are allocated or transferred to another + * cache the pointers are not cleared and they could be counted as + * valid references during a kmemleak scan. Therefore, kmemleak must + * not scan such objects. 
+ */ + kmemleak_no_scan(ac); + init_arraycache(ac, entries, batchcount); + return ac; +} + +static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, + struct slab *slab, void *objp) +{ + struct kmem_cache_node *n; + int slab_node; + LIST_HEAD(list); + + slab_node = slab_nid(slab); + n = get_node(cachep, slab_node); + + spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, slab_node, &list); + spin_unlock(&n->list_lock); + + slabs_destroy(cachep, &list); +} + +/* + * Transfer objects in one arraycache to another. + * Locking must be handled by the caller. + * + * Return the number of entries transferred. + */ +static int transfer_objects(struct array_cache *to, + struct array_cache *from, unsigned int max) +{ + /* Figure out how many entries to transfer */ + int nr = min3(from->avail, max, to->limit - to->avail); + + if (!nr) + return 0; + + memcpy(to->entry + to->avail, from->entry + from->avail - nr, + sizeof(void *) *nr); + + from->avail -= nr; + to->avail += nr; + return nr; +} + +/* &alien->lock must be held by alien callers. */ +static __always_inline void __free_one(struct array_cache *ac, void *objp) +{ + /* Avoid trivial double-free. 
*/ + if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && + WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp)) + return; + ac->entry[ac->avail++] = objp; +} + +#ifndef CONFIG_NUMA + +#define drain_alien_cache(cachep, alien) do { } while (0) +#define reap_alien(cachep, n) do { } while (0) + +static inline struct alien_cache **alloc_alien_cache(int node, + int limit, gfp_t gfp) +{ + return NULL; +} + +static inline void free_alien_cache(struct alien_cache **ac_ptr) +{ +} + +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +{ + return 0; +} + +static inline gfp_t gfp_exact_node(gfp_t flags) +{ + return flags & ~__GFP_NOFAIL; +} + +#else /* CONFIG_NUMA */ + +static struct alien_cache *__alloc_alien_cache(int node, int entries, + int batch, gfp_t gfp) +{ + size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); + struct alien_cache *alc = NULL; + + alc = kmalloc_node(memsize, gfp, node); + if (alc) { + kmemleak_no_scan(alc); + init_arraycache(&alc->ac, entries, batch); + spin_lock_init(&alc->lock); + } + return alc; +} + +static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) +{ + struct alien_cache **alc_ptr; + int i; + + if (limit > 1) + limit = 12; + alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node); + if (!alc_ptr) + return NULL; + + for_each_node(i) { + if (i == node || !node_online(i)) + continue; + alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); + if (!alc_ptr[i]) { + for (i--; i >= 0; i--) + kfree(alc_ptr[i]); + kfree(alc_ptr); + return NULL; + } + } + return alc_ptr; +} + +static void free_alien_cache(struct alien_cache **alc_ptr) +{ + int i; + + if (!alc_ptr) + return; + for_each_node(i) + kfree(alc_ptr[i]); + kfree(alc_ptr); +} + +static void __drain_alien_cache(struct kmem_cache *cachep, + struct array_cache *ac, int node, + struct list_head *list) +{ + struct kmem_cache_node *n = get_node(cachep, node); + + if (ac->avail) { + spin_lock(&n->list_lock); + /* 
+ * Stuff objects into the remote nodes shared array first. + * That way we could avoid the overhead of putting the objects + * into the free lists and getting them back later. + */ + if (n->shared) + transfer_objects(n->shared, ac, ac->limit); + + free_block(cachep, ac->entry, ac->avail, node, list); + ac->avail = 0; + spin_unlock(&n->list_lock); + } +} + +/* + * Called from cache_reap() to regularly drain alien caches round robin. + */ +static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) +{ + int node = __this_cpu_read(slab_reap_node); + + if (n->alien) { + struct alien_cache *alc = n->alien[node]; + struct array_cache *ac; + + if (alc) { + ac = &alc->ac; + if (ac->avail && spin_trylock_irq(&alc->lock)) { + LIST_HEAD(list); + + __drain_alien_cache(cachep, ac, node, &list); + spin_unlock_irq(&alc->lock); + slabs_destroy(cachep, &list); + } + } + } +} + +static void drain_alien_cache(struct kmem_cache *cachep, + struct alien_cache **alien) +{ + int i = 0; + struct alien_cache *alc; + struct array_cache *ac; + unsigned long flags; + + for_each_online_node(i) { + alc = alien[i]; + if (alc) { + LIST_HEAD(list); + + ac = &alc->ac; + spin_lock_irqsave(&alc->lock, flags); + __drain_alien_cache(cachep, ac, i, &list); + spin_unlock_irqrestore(&alc->lock, flags); + slabs_destroy(cachep, &list); + } + } +} + +static int __cache_free_alien(struct kmem_cache *cachep, void *objp, + int node, int slab_node) +{ + struct kmem_cache_node *n; + struct alien_cache *alien = NULL; + struct array_cache *ac; + LIST_HEAD(list); + + n = get_node(cachep, node); + STATS_INC_NODEFREES(cachep); + if (n->alien && n->alien[slab_node]) { + alien = n->alien[slab_node]; + ac = &alien->ac; + spin_lock(&alien->lock); + if (unlikely(ac->avail == ac->limit)) { + STATS_INC_ACOVERFLOW(cachep); + __drain_alien_cache(cachep, ac, slab_node, &list); + } + __free_one(ac, objp); + spin_unlock(&alien->lock); + slabs_destroy(cachep, &list); + } else { + n = get_node(cachep, slab_node); + 
spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, slab_node, &list); + spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); + } + return 1; +} + +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +{ + int slab_node = slab_nid(virt_to_slab(objp)); + int node = numa_mem_id(); + /* + * Make sure we are not freeing an object from another node to the array + * cache on this cpu. + */ + if (likely(node == slab_node)) + return 0; + + return __cache_free_alien(cachep, objp, node, slab_node); +} + +/* + * Construct gfp mask to allocate from a specific node but do not reclaim or + * warn about failures. + */ +static inline gfp_t gfp_exact_node(gfp_t flags) +{ + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL); +} +#endif + +static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) +{ + struct kmem_cache_node *n; + + /* + * Set up the kmem_cache_node for cpu before we can + * begin anything. Make sure some other cpu on this + * node has not already allocated this + */ + n = get_node(cachep, node); + if (n) { + spin_lock_irq(&n->list_lock); + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + + cachep->num; + spin_unlock_irq(&n->list_lock); + + return 0; + } + + n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); + if (!n) + return -ENOMEM; + + kmem_cache_node_init(n); + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; + + n->free_limit = + (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; + + /* + * The kmem_cache_nodes don't come and go as CPUs + * come and go. slab_mutex provides sufficient + * protection here. + */ + cachep->node[node] = n; + + return 0; +} + +#if defined(CONFIG_NUMA) || defined(CONFIG_SMP) +/* + * Allocates and initializes node for a node on each slab cache, used for + * either memory or cpu hotplug. 
If memory is being hot-added, the kmem_cache_node + * will be allocated off-node since memory is not yet online for the new node. + * When hotplugging memory or a cpu, existing nodes are not replaced if + * already in use. + * + * Must hold slab_mutex. + */ +static int init_cache_node_node(int node) +{ + int ret; + struct kmem_cache *cachep; + + list_for_each_entry(cachep, &slab_caches, list) { + ret = init_cache_node(cachep, node, GFP_KERNEL); + if (ret) + return ret; + } + + return 0; +} +#endif + +static int setup_kmem_cache_node(struct kmem_cache *cachep, + int node, gfp_t gfp, bool force_change) +{ + int ret = -ENOMEM; + struct kmem_cache_node *n; + struct array_cache *old_shared = NULL; + struct array_cache *new_shared = NULL; + struct alien_cache **new_alien = NULL; + LIST_HEAD(list); + + if (use_alien_caches) { + new_alien = alloc_alien_cache(node, cachep->limit, gfp); + if (!new_alien) + goto fail; + } + + if (cachep->shared) { + new_shared = alloc_arraycache(node, + cachep->shared * cachep->batchcount, 0xbaadf00d, gfp); + if (!new_shared) + goto fail; + } + + ret = init_cache_node(cachep, node, gfp); + if (ret) + goto fail; + + n = get_node(cachep, node); + spin_lock_irq(&n->list_lock); + if (n->shared && force_change) { + free_block(cachep, n->shared->entry, + n->shared->avail, node, &list); + n->shared->avail = 0; + } + + if (!n->shared || force_change) { + old_shared = n->shared; + n->shared = new_shared; + new_shared = NULL; + } + + if (!n->alien) { + n->alien = new_alien; + new_alien = NULL; + } + + spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); + + /* + * To protect lockless access to n->shared during irq disabled context. + * If n->shared isn't NULL in irq disabled context, accessing to it is + * guaranteed to be valid until irq is re-enabled, because it will be + * freed after synchronize_rcu(). 
+ */ + if (old_shared && force_change) + synchronize_rcu(); + +fail: + kfree(old_shared); + kfree(new_shared); + free_alien_cache(new_alien); + + return ret; +} + +#ifdef CONFIG_SMP + +static void cpuup_canceled(long cpu) +{ + struct kmem_cache *cachep; + struct kmem_cache_node *n = NULL; + int node = cpu_to_mem(cpu); + const struct cpumask *mask = cpumask_of_node(node); + + list_for_each_entry(cachep, &slab_caches, list) { + struct array_cache *nc; + struct array_cache *shared; + struct alien_cache **alien; + LIST_HEAD(list); + + n = get_node(cachep, node); + if (!n) + continue; + + spin_lock_irq(&n->list_lock); + + /* Free limit for this kmem_cache_node */ + n->free_limit -= cachep->batchcount; + + /* cpu is dead; no one can alloc from it. */ + nc = per_cpu_ptr(cachep->cpu_cache, cpu); + free_block(cachep, nc->entry, nc->avail, node, &list); + nc->avail = 0; + + if (!cpumask_empty(mask)) { + spin_unlock_irq(&n->list_lock); + goto free_slab; + } + + shared = n->shared; + if (shared) { + free_block(cachep, shared->entry, + shared->avail, node, &list); + n->shared = NULL; + } + + alien = n->alien; + n->alien = NULL; + + spin_unlock_irq(&n->list_lock); + + kfree(shared); + if (alien) { + drain_alien_cache(cachep, alien); + free_alien_cache(alien); + } + +free_slab: + slabs_destroy(cachep, &list); + } + /* + * In the previous loop, all the objects were freed to + * the respective cache's slabs, now we can go ahead and + * shrink each nodelist to its limit. + */ + list_for_each_entry(cachep, &slab_caches, list) { + n = get_node(cachep, node); + if (!n) + continue; + drain_freelist(cachep, n, INT_MAX); + } +} + +static int cpuup_prepare(long cpu) +{ + struct kmem_cache *cachep; + int node = cpu_to_mem(cpu); + int err; + + /* + * We need to do this right in the beginning since + * alloc_arraycache's are going to use this list. 
+ * kmalloc_node allows us to add the slab to the right + * kmem_cache_node and not this cpu's kmem_cache_node + */ + err = init_cache_node_node(node); + if (err < 0) + goto bad; + + /* + * Now we can go ahead with allocating the shared arrays and + * array caches + */ + list_for_each_entry(cachep, &slab_caches, list) { + err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false); + if (err) + goto bad; + } + + return 0; +bad: + cpuup_canceled(cpu); + return -ENOMEM; +} + +int slab_prepare_cpu(unsigned int cpu) +{ + int err; + + mutex_lock(&slab_mutex); + err = cpuup_prepare(cpu); + mutex_unlock(&slab_mutex); + return err; +} + +/* + * This is called for a failed online attempt and for a successful + * offline. + * + * Even if all the cpus of a node are down, we don't free the + * kmem_cache_node of any cache. This is to avoid a race between cpu_down, and + * a kmalloc allocation from another cpu for memory from the node of + * the cpu going down. The kmem_cache_node structure is usually allocated from + * kmem_cache_create() and gets destroyed at kmem_cache_destroy(). + */ +int slab_dead_cpu(unsigned int cpu) +{ + mutex_lock(&slab_mutex); + cpuup_canceled(cpu); + mutex_unlock(&slab_mutex); + return 0; +} +#endif + +static int slab_online_cpu(unsigned int cpu) +{ + start_cpu_timer(cpu); + return 0; +} + +static int slab_offline_cpu(unsigned int cpu) +{ + /* + * Shutdown cache reaper. Note that the slab_mutex is held so + * that if cache_reap() is invoked it cannot do anything + * expensive but will only modify reap_work and reschedule the + * timer. + */ + cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); + /* Now the cache_reaper is guaranteed to be not running. */ + per_cpu(slab_reap_work, cpu).work.func = NULL; + return 0; +} + +#if defined(CONFIG_NUMA) +/* + * Drains freelist for a node on each slab cache, used for memory hot-remove. + * Returns -EBUSY if all objects cannot be drained so that the node is not + * removed. + * + * Must hold slab_mutex. 
+ */ +static int __meminit drain_cache_node_node(int node) +{ + struct kmem_cache *cachep; + int ret = 0; + + list_for_each_entry(cachep, &slab_caches, list) { + struct kmem_cache_node *n; + + n = get_node(cachep, node); + if (!n) + continue; + + drain_freelist(cachep, n, INT_MAX); + + if (!list_empty(&n->slabs_full) || + !list_empty(&n->slabs_partial)) { + ret = -EBUSY; + break; + } + } + return ret; +} + +static int __meminit slab_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mnb = arg; + int ret = 0; + int nid; + + nid = mnb->status_change_nid; + if (nid < 0) + goto out; + + switch (action) { + case MEM_GOING_ONLINE: + mutex_lock(&slab_mutex); + ret = init_cache_node_node(nid); + mutex_unlock(&slab_mutex); + break; + case MEM_GOING_OFFLINE: + mutex_lock(&slab_mutex); + ret = drain_cache_node_node(nid); + mutex_unlock(&slab_mutex); + break; + case MEM_ONLINE: + case MEM_OFFLINE: + case MEM_CANCEL_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } +out: + return notifier_from_errno(ret); +} +#endif /* CONFIG_NUMA */ + +/* + * swap the static kmem_cache_node with kmalloced memory + */ +static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, + int nodeid) +{ + struct kmem_cache_node *ptr; + + ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); + BUG_ON(!ptr); + + memcpy(ptr, list, sizeof(struct kmem_cache_node)); + /* + * Do not assume that spinlocks can be initialized via memcpy: + */ + spin_lock_init(&ptr->list_lock); + + MAKE_ALL_LISTS(cachep, ptr, nodeid); + cachep->node[nodeid] = ptr; +} + +/* + * For setting up all the kmem_cache_node for cache whose buffer_size is same as + * size of kmem_cache_node. 
+ */ +static void __init set_up_node(struct kmem_cache *cachep, int index) +{ + int node; + + for_each_online_node(node) { + cachep->node[node] = &init_kmem_cache_node[index + node]; + cachep->node[node]->next_reap = jiffies + + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; + } +} + +/* + * Initialisation. Called after the page allocator have been initialised and + * before smp_init(). + */ +void __init kmem_cache_init(void) +{ + int i; + + kmem_cache = &kmem_cache_boot; + + if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1) + use_alien_caches = 0; + + for (i = 0; i < NUM_INIT_LISTS; i++) + kmem_cache_node_init(&init_kmem_cache_node[i]); + + /* + * Fragmentation resistance on low memory - only use bigger + * page orders on machines with more than 32MB of memory if + * not overridden on the command line. + */ + if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT) + slab_max_order = SLAB_MAX_ORDER_HI; + + /* Bootstrap is tricky, because several objects are allocated + * from caches that do not exist yet: + * 1) initialize the kmem_cache cache: it contains the struct + * kmem_cache structures of all caches, except kmem_cache itself: + * kmem_cache is statically allocated. + * Initially an __init data area is used for the head array and the + * kmem_cache_node structures, it's replaced with a kmalloc allocated + * array at the end of the bootstrap. + * 2) Create the first kmalloc cache. + * The struct kmem_cache for the new cache is allocated normally. + * An __init data area is used for the head array. + * 3) Create the remaining kmalloc caches, with minimally sized + * head arrays. + * 4) Replace the __init data head arrays for kmem_cache and the first + * kmalloc cache with kmalloc allocated arrays. + * 5) Replace the __init data for kmem_cache_node for kmem_cache and + * the other cache's with kmalloc allocated memory. + * 6) Resize the head arrays of the kmalloc caches to their final sizes. 
+ */ + + /* 1) create the kmem_cache */ + + /* + * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids + */ + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *), + SLAB_HWCACHE_ALIGN, 0, 0); + list_add(&kmem_cache->list, &slab_caches); + slab_state = PARTIAL; + + /* + * Initialize the caches that provide memory for the kmem_cache_node + * structures first. Without this, further allocations will bug. + */ + kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache( + kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL], + kmalloc_info[INDEX_NODE].size, + ARCH_KMALLOC_FLAGS, 0, + kmalloc_info[INDEX_NODE].size); + slab_state = PARTIAL_NODE; + setup_kmalloc_cache_index_table(); + + slab_early_init = 0; + + /* 5) Replace the bootstrap kmem_cache_node */ + { + int nid; + + for_each_online_node(nid) { + init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); + + init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE], + &init_kmem_cache_node[SIZE_NODE + nid], nid); + } + } + + create_kmalloc_caches(ARCH_KMALLOC_FLAGS); +} + +void __init kmem_cache_init_late(void) +{ + struct kmem_cache *cachep; + + /* 6) resize the head arrays to their final sizes */ + mutex_lock(&slab_mutex); + list_for_each_entry(cachep, &slab_caches, list) + if (enable_cpucache(cachep, GFP_NOWAIT)) + BUG(); + mutex_unlock(&slab_mutex); + + /* Done! */ + slab_state = FULL; + +#ifdef CONFIG_NUMA + /* + * Register a memory hotplug callback that initializes and frees + * node. + */ + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); +#endif + + /* + * The reap timers are started later, with a module init call: That part + * of the kernel is not yet operational. 
+ */ +} + +static int __init cpucache_init(void) +{ + int ret; + + /* + * Register the timers that return unneeded pages to the page allocator + */ + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "SLAB online", + slab_online_cpu, slab_offline_cpu); + WARN_ON(ret < 0); + + return 0; +} +__initcall(cpucache_init); + +static noinline void +slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) +{ +#if DEBUG + struct kmem_cache_node *n; + unsigned long flags; + int node; + static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) + return; + + pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", + nodeid, gfpflags, &gfpflags); + pr_warn(" cache: %s, object size: %d, order: %d\n", + cachep->name, cachep->size, cachep->gfporder); + + for_each_kmem_cache_node(cachep, node, n) { + unsigned long total_slabs, free_slabs, free_objs; + + spin_lock_irqsave(&n->list_lock, flags); + total_slabs = n->total_slabs; + free_slabs = n->free_slabs; + free_objs = n->free_objects; + spin_unlock_irqrestore(&n->list_lock, flags); + + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", + node, total_slabs - free_slabs, total_slabs, + (total_slabs * cachep->num) - free_objs, + total_slabs * cachep->num); + } +#endif +} + +/* + * Interface to system's page allocator. No need to hold the + * kmem_cache_node ->list_lock. + * + * If we requested dmaable memory, we will get it. Even if we + * did not request dmaable memory, we might get it, but that + * would be relatively rare and ignorable. 
+ */ +static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, + int nodeid) +{ + struct folio *folio; + struct slab *slab; + + flags |= cachep->allocflags; + + folio = (struct folio *) __alloc_pages_node(nodeid, flags, cachep->gfporder); + if (!folio) { + slab_out_of_memory(cachep, flags, nodeid); + return NULL; + } + + slab = folio_slab(folio); + + account_slab(slab, cachep->gfporder, cachep, flags); + __folio_set_slab(folio); + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ + if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0))) + slab_set_pfmemalloc(slab); + + return slab; +} + +/* + * Interface to system's page release. + */ +static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab) +{ + int order = cachep->gfporder; + struct folio *folio = slab_folio(slab); + + BUG_ON(!folio_test_slab(folio)); + __slab_clear_pfmemalloc(slab); + __folio_clear_slab(folio); + page_mapcount_reset(folio_page(folio, 0)); + folio->mapping = NULL; + + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += 1 << order; + unaccount_slab(slab, order, cachep); + __free_pages(folio_page(folio, 0), order); +} + +static void kmem_rcu_free(struct rcu_head *head) +{ + struct kmem_cache *cachep; + struct slab *slab; + + slab = container_of(head, struct slab, rcu_head); + cachep = slab->slab_cache; + + kmem_freepages(cachep, slab); +} + +#if DEBUG +static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) +{ + if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) && + (cachep->size % PAGE_SIZE) == 0) + return true; + + return false; +} + +#ifdef CONFIG_DEBUG_PAGEALLOC +static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map) +{ + if (!is_debug_pagealloc_cache(cachep)) + return; + + __kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); +} + +#else +static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map) {} + +#endif + +static void 
poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) +{ + int size = cachep->object_size; + addr = &((char *)addr)[obj_offset(cachep)]; + + memset(addr, val, size); + *(unsigned char *)(addr + size - 1) = POISON_END; +} + +static void dump_line(char *data, int offset, int limit) +{ + int i; + unsigned char error = 0; + int bad_count = 0; + + pr_err("%03x: ", offset); + for (i = 0; i < limit; i++) { + if (data[offset + i] != POISON_FREE) { + error = data[offset + i]; + bad_count++; + } + } + print_hex_dump(KERN_CONT, "", 0, 16, 1, + &data[offset], limit, 1); + + if (bad_count == 1) { + error ^= POISON_FREE; + if (!(error & (error - 1))) { + pr_err("Single bit error detected. Probably bad RAM.\n"); +#ifdef CONFIG_X86 + pr_err("Run memtest86+ or a similar memory test tool.\n"); +#else + pr_err("Run a memory test tool.\n"); +#endif + } + } +} +#endif + +#if DEBUG + +static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) +{ + int i, size; + char *realobj; + + if (cachep->flags & SLAB_RED_ZONE) { + pr_err("Redzone: 0x%llx/0x%llx\n", + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); + } + + if (cachep->flags & SLAB_STORE_USER) + pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp)); + realobj = (char *)objp + obj_offset(cachep); + size = cachep->object_size; + for (i = 0; i < size && lines; i += 16, lines--) { + int limit; + limit = 16; + if (i + limit > size) + limit = size - i; + dump_line(realobj, i, limit); + } +} + +static void check_poison_obj(struct kmem_cache *cachep, void *objp) +{ + char *realobj; + int size, i; + int lines = 0; + + if (is_debug_pagealloc_cache(cachep)) + return; + + realobj = (char *)objp + obj_offset(cachep); + size = cachep->object_size; + + for (i = 0; i < size; i++) { + char exp = POISON_FREE; + if (i == size - 1) + exp = POISON_END; + if (realobj[i] != exp) { + int limit; + /* Mismatch ! 
*/ + /* Print header */ + if (lines == 0) { + pr_err("Slab corruption (%s): %s start=%px, len=%d\n", + print_tainted(), cachep->name, + realobj, size); + print_objinfo(cachep, objp, 0); + } + /* Hexdump the affected line */ + i = (i / 16) * 16; + limit = 16; + if (i + limit > size) + limit = size - i; + dump_line(realobj, i, limit); + i += 16; + lines++; + /* Limit to 5 lines */ + if (lines > 5) + break; + } + } + if (lines != 0) { + /* Print some data about the neighboring objects, if they + * exist: + */ + struct slab *slab = virt_to_slab(objp); + unsigned int objnr; + + objnr = obj_to_index(cachep, slab, objp); + if (objnr) { + objp = index_to_obj(cachep, slab, objnr - 1); + realobj = (char *)objp + obj_offset(cachep); + pr_err("Prev obj: start=%px, len=%d\n", realobj, size); + print_objinfo(cachep, objp, 2); + } + if (objnr + 1 < cachep->num) { + objp = index_to_obj(cachep, slab, objnr + 1); + realobj = (char *)objp + obj_offset(cachep); + pr_err("Next obj: start=%px, len=%d\n", realobj, size); + print_objinfo(cachep, objp, 2); + } + } +} +#endif + +#if DEBUG +static void slab_destroy_debugcheck(struct kmem_cache *cachep, + struct slab *slab) +{ + int i; + + if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) { + poison_obj(cachep, slab->freelist - obj_offset(cachep), + POISON_FREE); + } + + for (i = 0; i < cachep->num; i++) { + void *objp = index_to_obj(cachep, slab, i); + + if (cachep->flags & SLAB_POISON) { + check_poison_obj(cachep, objp); + slab_kernel_map(cachep, objp, 1); + } + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "start of a freed object was overwritten"); + if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "end of a freed object was overwritten"); + } + } +} +#else +static void slab_destroy_debugcheck(struct kmem_cache *cachep, + struct slab *slab) +{ +} +#endif + +/** + * slab_destroy - destroy and release all objects in a slab + * @cachep: cache 
pointer being destroyed + * @slab: slab being destroyed + * + * Destroy all the objs in a slab, and release the mem back to the system. + * Before calling the slab must have been unlinked from the cache. The + * kmem_cache_node ->list_lock is not held/needed. + */ +static void slab_destroy(struct kmem_cache *cachep, struct slab *slab) +{ + void *freelist; + + freelist = slab->freelist; + slab_destroy_debugcheck(cachep, slab); + if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU)) + call_rcu(&slab->rcu_head, kmem_rcu_free); + else + kmem_freepages(cachep, slab); + + /* + * From now on, we don't use freelist + * although actual page can be freed in rcu context + */ + if (OFF_SLAB(cachep)) + kfree(freelist); +} + +/* + * Update the size of the caches before calling slabs_destroy as it may + * recursively call kfree. + */ +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) +{ + struct slab *slab, *n; + + list_for_each_entry_safe(slab, n, list, slab_list) { + list_del(&slab->slab_list); + slab_destroy(cachep, slab); + } +} + +/** + * calculate_slab_order - calculate size (page order) of slabs + * @cachep: pointer to the cache that is being created + * @size: size of objects to be created in this cache. + * @flags: slab allocation flags + * + * Also calculates the number of objects per slab. + * + * This could be made much more intelligent. For now, try to avoid using + * high order pages for slabs. When the gfp() functions are more friendly + * towards high-order requests, this should be changed. 
+ * + * Return: number of left-over bytes in a slab + */ +static size_t calculate_slab_order(struct kmem_cache *cachep, + size_t size, slab_flags_t flags) +{ + size_t left_over = 0; + int gfporder; + + for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { + unsigned int num; + size_t remainder; + + num = cache_estimate(gfporder, size, flags, &remainder); + if (!num) + continue; + + /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */ + if (num > SLAB_OBJ_MAX_NUM) + break; + + if (flags & CFLGS_OFF_SLAB) { + struct kmem_cache *freelist_cache; + size_t freelist_size; + size_t freelist_cache_size; + + freelist_size = num * sizeof(freelist_idx_t); + if (freelist_size > KMALLOC_MAX_CACHE_SIZE) { + freelist_cache_size = PAGE_SIZE << get_order(freelist_size); + } else { + freelist_cache = kmalloc_slab(freelist_size, 0u); + if (!freelist_cache) + continue; + freelist_cache_size = freelist_cache->size; + + /* + * Needed to avoid possible looping condition + * in cache_grow_begin() + */ + if (OFF_SLAB(freelist_cache)) + continue; + } + + /* check if off slab has enough benefit */ + if (freelist_cache_size > cachep->size / 2) + continue; + } + + /* Found something acceptable - save it away */ + cachep->num = num; + cachep->gfporder = gfporder; + left_over = remainder; + + /* + * A VFS-reclaimable slab tends to have most allocations + * as GFP_NOFS and we really don't want to have to be allocating + * higher-order pages when we are unable to shrink dcache. + */ + if (flags & SLAB_RECLAIM_ACCOUNT) + break; + + /* + * Large number of objects is good, but very large slabs are + * currently bad for the gfp()s. + */ + if (gfporder >= slab_max_order) + break; + + /* + * Acceptable internal fragmentation? 
+ */ + if (left_over * 8 <= (PAGE_SIZE << gfporder)) + break; + } + return left_over; +} + +static struct array_cache __percpu *alloc_kmem_cache_cpus( + struct kmem_cache *cachep, int entries, int batchcount) +{ + int cpu; + size_t size; + struct array_cache __percpu *cpu_cache; + + size = sizeof(void *) * entries + sizeof(struct array_cache); + cpu_cache = __alloc_percpu(size, sizeof(void *)); + + if (!cpu_cache) + return NULL; + + for_each_possible_cpu(cpu) { + init_arraycache(per_cpu_ptr(cpu_cache, cpu), + entries, batchcount); + } + + return cpu_cache; +} + +static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) +{ + if (slab_state >= FULL) + return enable_cpucache(cachep, gfp); + + cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1); + if (!cachep->cpu_cache) + return 1; + + if (slab_state == DOWN) { + /* Creation of first cache (kmem_cache). */ + set_up_node(kmem_cache, CACHE_CACHE); + } else if (slab_state == PARTIAL) { + /* For kmem_cache_node */ + set_up_node(cachep, SIZE_NODE); + } else { + int node; + + for_each_online_node(node) { + cachep->node[node] = kmalloc_node( + sizeof(struct kmem_cache_node), gfp, node); + BUG_ON(!cachep->node[node]); + kmem_cache_node_init(cachep->node[node]); + } + } + + cachep->node[numa_mem_id()]->next_reap = + jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; + + cpu_cache_get(cachep)->avail = 0; + cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; + cpu_cache_get(cachep)->batchcount = 1; + cpu_cache_get(cachep)->touched = 0; + cachep->batchcount = 1; + cachep->limit = BOOT_CPUCACHE_ENTRIES; + return 0; +} + +slab_flags_t kmem_cache_flags(unsigned int object_size, + slab_flags_t flags, const char *name) +{ + return flags; +} + +struct kmem_cache * +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)) +{ + struct kmem_cache *cachep; + + cachep = find_mergeable(size, align, flags, name, ctor); + if (cachep) { + 
cachep->refcount++; + + /* + * Adjust the object sizes so that we clear + * the complete object on kzalloc. + */ + cachep->object_size = max_t(int, cachep->object_size, size); + } + return cachep; +} + +static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, + size_t size, slab_flags_t flags) +{ + size_t left; + + cachep->num = 0; + + /* + * If slab auto-initialization on free is enabled, store the freelist + * off-slab, so that its contents don't end up in one of the allocated + * objects. + */ + if (unlikely(slab_want_init_on_free(cachep))) + return false; + + if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU) + return false; + + left = calculate_slab_order(cachep, size, + flags | CFLGS_OBJFREELIST_SLAB); + if (!cachep->num) + return false; + + if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + +static bool set_off_slab_cache(struct kmem_cache *cachep, + size_t size, slab_flags_t flags) +{ + size_t left; + + cachep->num = 0; + + /* + * Always use on-slab management when SLAB_NOLEAKTRACE + * to avoid recursive calls into kmemleak. + */ + if (flags & SLAB_NOLEAKTRACE) + return false; + + /* + * Size is large, assume best to place the slab management obj + * off-slab (should allow better packing of objs). + */ + left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB); + if (!cachep->num) + return false; + + /* + * If the slab has been placed off-slab, and we have enough space then + * move it on-slab. This is at the expense of any extra colouring. 
+ */ + if (left >= cachep->num * sizeof(freelist_idx_t)) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + +static bool set_on_slab_cache(struct kmem_cache *cachep, + size_t size, slab_flags_t flags) +{ + size_t left; + + cachep->num = 0; + + left = calculate_slab_order(cachep, size, flags); + if (!cachep->num) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + +/** + * __kmem_cache_create - Create a cache. + * @cachep: cache management descriptor + * @flags: SLAB flags + * + * Returns a ptr to the cache on success, NULL on failure. + * Cannot be called within an int, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + * + * Return: a pointer to the created cache or %NULL in case of error + */ +int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) +{ + size_t ralign = BYTES_PER_WORD; + gfp_t gfp; + int err; + unsigned int size = cachep->size; + +#if DEBUG +#if FORCED_DEBUG + /* + * Enable redzoning and last user accounting, except for caches with + * large objects, if the increased size would increase the object size + * above the next power of two: caches with object sizes just above a + * power of two have a significant amount of internal fragmentation. + */ + if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + + 2 * sizeof(unsigned long long))) + flags |= SLAB_RED_ZONE | SLAB_STORE_USER; + if (!(flags & SLAB_TYPESAFE_BY_RCU)) + flags |= SLAB_POISON; +#endif +#endif + + /* + * Check that size is in terms of words. 
This is needed to avoid + * unaligned accesses for some archs when redzoning is used, and makes + * sure any on-slab bufctl's are also correctly aligned. + */ + size = ALIGN(size, BYTES_PER_WORD); + + if (flags & SLAB_RED_ZONE) { + ralign = REDZONE_ALIGN; + /* If redzoning, ensure that the second redzone is suitably + * aligned, by adjusting the object size accordingly. */ + size = ALIGN(size, REDZONE_ALIGN); + } + + /* 3) caller mandated alignment */ + if (ralign < cachep->align) { + ralign = cachep->align; + } + /* disable debug if necessary */ + if (ralign > __alignof__(unsigned long long)) + flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); + /* + * 4) Store it. + */ + cachep->align = ralign; + cachep->colour_off = cache_line_size(); + /* Offset must be a multiple of the alignment. */ + if (cachep->colour_off < cachep->align) + cachep->colour_off = cachep->align; + + if (slab_is_available()) + gfp = GFP_KERNEL; + else + gfp = GFP_NOWAIT; + +#if DEBUG + + /* + * Both debugging options require word-alignment which is calculated + * into align above. + */ + if (flags & SLAB_RED_ZONE) { + /* add space for red zone words */ + cachep->obj_offset += sizeof(unsigned long long); + size += 2 * sizeof(unsigned long long); + } + if (flags & SLAB_STORE_USER) { + /* user store requires one word storage behind the end of + * the real object. But if the second red zone needs to be + * aligned to 64 bits, we must allow that much space. + */ + if (flags & SLAB_RED_ZONE) + size += REDZONE_ALIGN; + else + size += BYTES_PER_WORD; + } +#endif + + kasan_cache_create(cachep, &size, &flags); + + size = ALIGN(size, cachep->align); + /* + * We should restrict the number of objects in a slab to implement + * byte sized index. Refer comment on SLAB_OBJ_MIN_SIZE definition. + */ + if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) + size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); + +#if DEBUG + /* + * To activate debug pagealloc, off-slab management is necessary + * requirement. 
In early phase of initialization, small sized slab + * doesn't get initialized so it would not be possible. So, we need + * to check size >= 256. It guarantees that all necessary small + * sized slab is initialized in current slab initialization sequence. + */ + if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) && + size >= 256 && cachep->object_size > cache_line_size()) { + if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { + size_t tmp_size = ALIGN(size, PAGE_SIZE); + + if (set_off_slab_cache(cachep, tmp_size, flags)) { + flags |= CFLGS_OFF_SLAB; + cachep->obj_offset += tmp_size - size; + size = tmp_size; + goto done; + } + } + } +#endif + + if (set_objfreelist_slab_cache(cachep, size, flags)) { + flags |= CFLGS_OBJFREELIST_SLAB; + goto done; + } + + if (set_off_slab_cache(cachep, size, flags)) { + flags |= CFLGS_OFF_SLAB; + goto done; + } + + if (set_on_slab_cache(cachep, size, flags)) + goto done; + + return -E2BIG; + +done: + cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); + cachep->flags = flags; + cachep->allocflags = __GFP_COMP; + if (flags & SLAB_CACHE_DMA) + cachep->allocflags |= GFP_DMA; + if (flags & SLAB_CACHE_DMA32) + cachep->allocflags |= GFP_DMA32; + if (flags & SLAB_RECLAIM_ACCOUNT) + cachep->allocflags |= __GFP_RECLAIMABLE; + cachep->size = size; + cachep->reciprocal_buffer_size = reciprocal_value(size); + +#if DEBUG + /* + * If we're going to use the generic kernel_map_pages() + * poisoning, then it's going to smash the contents of + * the redzone and userword anyhow, so switch them off. 
+ */ + if (IS_ENABLED(CONFIG_PAGE_POISONING) && + (cachep->flags & SLAB_POISON) && + is_debug_pagealloc_cache(cachep)) + cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); +#endif + + err = setup_cpu_cache(cachep, gfp); + if (err) { + __kmem_cache_release(cachep); + return err; + } + + return 0; +} +/* + * Context assertions. With DEBUG set they BUG (or assert) when the expected + * IRQ state or lock is not held; the two spinlock checks are additionally + * compiled only under CONFIG_SMP. Without DEBUG they are empty statements. + */ +#if DEBUG +static void check_irq_off(void) +{ + BUG_ON(!irqs_disabled()); +} + +static void check_irq_on(void) +{ + BUG_ON(irqs_disabled()); +} + +static void check_mutex_acquired(void) +{ + BUG_ON(!mutex_is_locked(&slab_mutex)); +} + +static void check_spinlock_acquired(struct kmem_cache *cachep) +{ +#ifdef CONFIG_SMP + check_irq_off(); + assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); +#endif +} + +static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) +{ +#ifdef CONFIG_SMP + check_irq_off(); + assert_spin_locked(&get_node(cachep, node)->list_lock); +#endif +} + +#else +#define check_irq_off() do { } while(0) +#define check_irq_on() do { } while(0) +#define check_mutex_acquired() do { } while(0) +#define check_spinlock_acquired(x) do { } while(0) +#define check_spinlock_acquired_node(x, y) do { } while(0) +#endif +/* + * Give objects from the front of @ac back to the slab lists via + * free_block(): all of them when @free_all, else (limit + 4) / 5, or + * (avail + 1) / 2 if that is more than avail. A NULL or empty @ac is a + * no-op. drain_cpu_caches() calls this with the node list_lock held; slabs + * that free_block() detaches end up on @list. + */ +static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, + int node, bool free_all, struct list_head *list) +{ + int tofree; + + if (!ac || !ac->avail) + return; + + tofree = free_all ? 
ac->avail : (ac->limit + 4) / 5; + if (tofree > ac->avail) + tofree = (ac->avail + 1) / 2; + + free_block(cachep, ac->entry, tofree, node, list); + ac->avail -= tofree; + memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); +} +/* + * on_each_cpu() callback used by drain_cpu_caches(); @arg is the cache. + * Hands every object in this CPU array cache to free_block() for the node + * given by numa_mem_id(), sets ac->avail to 0 and destroys the slabs that + * free_block() moved onto the local list. Expects interrupts disabled. + */ +static void do_drain(void *arg) +{ + struct kmem_cache *cachep = arg; + struct array_cache *ac; + int node = numa_mem_id(); + struct kmem_cache_node *n; + LIST_HEAD(list); + + check_irq_off(); + ac = cpu_cache_get(cachep); + n = get_node(cachep, node); + spin_lock(&n->list_lock); + free_block(cachep, ac->entry, ac->avail, node, &list); + spin_unlock(&n->list_lock); + ac->avail = 0; + slabs_destroy(cachep, &list); +} +/* + * Empty the caches that sit in front of the slab lists: run do_drain() on + * every CPU, call drain_alien_cache() for each node that has an alien + * cache, then drain each node shared array completely under list_lock and + * destroy the slabs released by that. + */ +static void drain_cpu_caches(struct kmem_cache *cachep) +{ + struct kmem_cache_node *n; + int node; + LIST_HEAD(list); + + on_each_cpu(do_drain, cachep, 1); + check_irq_on(); + for_each_kmem_cache_node(cachep, node, n) + if (n->alien) + drain_alien_cache(cachep, n->alien); + + for_each_kmem_cache_node(cachep, node, n) { + spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, n->shared, node, true, &list); + spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); + } +} + +/* + * Remove slabs from the list of free slabs. + * Specify the number of slabs to drain in tofree. + * + * Slabs are taken from the tail of n->slabs_free, one per list_lock + * acquisition. The list_empty() test in the loop condition runs without + * the lock, so the tail is checked again once the lock is held. + * + * Returns the actual number of slabs released. + */ +static int drain_freelist(struct kmem_cache *cache, + struct kmem_cache_node *n, int tofree) +{ + struct list_head *p; + int nr_freed; + struct slab *slab; + + nr_freed = 0; + while (nr_freed < tofree && !list_empty(&n->slabs_free)) { + + spin_lock_irq(&n->list_lock); + p = n->slabs_free.prev; + if (p == &n->slabs_free) { + spin_unlock_irq(&n->list_lock); + goto out; + } + + slab = list_entry(p, struct slab, slab_list); + list_del(&slab->slab_list); + n->free_slabs--; + n->total_slabs--; + /* + * Safe to drop the lock. The slab is no longer linked + * to the cache. 
+ */ + n->free_objects -= cache->num; + spin_unlock_irq(&n->list_lock); + slab_destroy(cache, slab); + nr_freed++; + } +out: + return nr_freed; +} +/* + * Return true when no node of @s has a slab on its slabs_full or + * slabs_partial list. The check is done without taking list_lock. + */ +bool __kmem_cache_empty(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) + if (!list_empty(&n->slabs_full) || + !list_empty(&n->slabs_partial)) + return false; + return true; +} +/* + * Drain the CPU, alien and shared caches, then release the free slabs of + * every node (up to INT_MAX each). Returns 1 if any node still has a full + * or partial slab afterwards, 0 otherwise. + */ +int __kmem_cache_shrink(struct kmem_cache *cachep) +{ + int ret = 0; + int node; + struct kmem_cache_node *n; + + drain_cpu_caches(cachep); + + check_irq_on(); + for_each_kmem_cache_node(cachep, node, n) { + drain_freelist(cachep, n, INT_MAX); + + ret += !list_empty(&n->slabs_full) || + !list_empty(&n->slabs_partial); + } + return (ret ? 1 : 0); +} +/* Shutdown is a plain shrink here; the return value is passed through. */ +int __kmem_cache_shutdown(struct kmem_cache *cachep) +{ + return __kmem_cache_shrink(cachep); +} +/* + * Free the bookkeeping attached to @cachep: calls + * cache_random_seq_destroy(), frees the per-CPU array caches, and for each + * node frees the shared array, the alien cache and the kmem_cache_node + * itself, clearing the matching cachep->node[] slot. + */ +void __kmem_cache_release(struct kmem_cache *cachep) +{ + int i; + struct kmem_cache_node *n; + + cache_random_seq_destroy(cachep); + + free_percpu(cachep->cpu_cache); + + /* NUMA: free the node structures */ + for_each_kmem_cache_node(cachep, i, n) { + kfree(n->shared); + free_alien_cache(n->alien); + kfree(n); + cachep->node[i] = NULL; + } +} + +/* + * Get the memory for a slab management obj. + * + * For a slab cache when the slab descriptor is off-slab, the + * slab descriptor can't come from the same cache which is being created, + * Because if it is the case, that means we defer the creation of + * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. + * And we eventually call down to __kmem_cache_create(), which + * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one. + * This is a "chicken-and-egg" problem. + * + * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, + * which are all initialized during kmem_cache_init(). 
+ */ +static void *alloc_slabmgmt(struct kmem_cache *cachep, + struct slab *slab, int colour_off, + gfp_t local_flags, int nodeid) +{ + void *freelist; + void *addr = slab_address(slab); + + slab->s_mem = addr + colour_off; + slab->active = 0; + + if (OBJFREELIST_SLAB(cachep)) + freelist = NULL; + else if (OFF_SLAB(cachep)) { + /* Slab management obj is off-slab. */ + freelist = kmalloc_node(cachep->freelist_size, + local_flags, nodeid); + } else { + /* We will use last bytes at the slab for freelist */ + freelist = addr + (PAGE_SIZE << cachep->gfporder) - + cachep->freelist_size; + } + + return freelist; +} + +static inline freelist_idx_t get_free_obj(struct slab *slab, unsigned int idx) +{ + return ((freelist_idx_t *) slab->freelist)[idx]; +} + +static inline void set_free_obj(struct slab *slab, + unsigned int idx, freelist_idx_t val) +{ + ((freelist_idx_t *)(slab->freelist))[idx] = val; +} + +static void cache_init_objs_debug(struct kmem_cache *cachep, struct slab *slab) +{ +#if DEBUG + int i; + + for (i = 0; i < cachep->num; i++) { + void *objp = index_to_obj(cachep, slab, i); + + if (cachep->flags & SLAB_STORE_USER) + *dbg_userword(cachep, objp) = NULL; + + if (cachep->flags & SLAB_RED_ZONE) { + *dbg_redzone1(cachep, objp) = RED_INACTIVE; + *dbg_redzone2(cachep, objp) = RED_INACTIVE; + } + /* + * Constructors are not allowed to allocate memory from the same + * cache which they are a constructor for. Otherwise, deadlock. + * They must also be threaded. 
+ */ + if (cachep->ctor && !(cachep->flags & SLAB_POISON)) { + kasan_unpoison_object_data(cachep, + objp + obj_offset(cachep)); + cachep->ctor(objp + obj_offset(cachep)); + kasan_poison_object_data( + cachep, objp + obj_offset(cachep)); + } + + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "constructor overwrote the end of an object"); + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) + slab_error(cachep, "constructor overwrote the start of an object"); + } + /* need to poison the objs? */ + if (cachep->flags & SLAB_POISON) { + poison_obj(cachep, objp, POISON_FREE); + slab_kernel_map(cachep, objp, 0); + } + } +#endif +} + +#ifdef CONFIG_SLAB_FREELIST_RANDOM +/* Hold information during a freelist initialization */ +union freelist_init_state { + struct { + unsigned int pos; + unsigned int *list; + unsigned int count; + }; + struct rnd_state rnd_state; +}; + +/* + * Initialize the state based on the randomization method available. + * return true if the pre-computed list is available, false otherwise. 
+ */ +static bool freelist_state_initialize(union freelist_init_state *state, + struct kmem_cache *cachep, + unsigned int count) +{ + bool ret; + unsigned int rand; + + /* Use best entropy available to define a random shift */ + rand = get_random_u32(); + + /* Use a random state if the pre-computed list is not available */ + if (!cachep->random_seq) { + prandom_seed_state(&state->rnd_state, rand); + ret = false; + } else { + state->list = cachep->random_seq; + state->count = count; + state->pos = rand % count; + ret = true; + } + return ret; +} + +/* Get the next entry on the list and randomize it using a random shift */ +static freelist_idx_t next_random_slot(union freelist_init_state *state) +{ + if (state->pos >= state->count) + state->pos = 0; + return state->list[state->pos++]; +} + +/* Swap two freelist entries */ +static void swap_free_obj(struct slab *slab, unsigned int a, unsigned int b) +{ + swap(((freelist_idx_t *) slab->freelist)[a], + ((freelist_idx_t *) slab->freelist)[b]); +} + +/* + * Shuffle the freelist initialization state based on pre-computed lists. + * return true if the list was successfully shuffled, false otherwise. + */ +static bool shuffle_freelist(struct kmem_cache *cachep, struct slab *slab) +{ + unsigned int objfreelist = 0, i, rand, count = cachep->num; + union freelist_init_state state; + bool precomputed; + + if (count < 2) + return false; + + precomputed = freelist_state_initialize(&state, cachep, count); + + /* Take a random entry as the objfreelist */ + if (OBJFREELIST_SLAB(cachep)) { + if (!precomputed) + objfreelist = count - 1; + else + objfreelist = next_random_slot(&state); + slab->freelist = index_to_obj(cachep, slab, objfreelist) + + obj_offset(cachep); + count--; + } + + /* + * On early boot, generate the list dynamically. + * Later use a pre-computed list for speed. 
+ */ + if (!precomputed) { + for (i = 0; i < count; i++) + set_free_obj(slab, i, i); + + /* Fisher-Yates shuffle */ + for (i = count - 1; i > 0; i--) { + rand = prandom_u32_state(&state.rnd_state); + rand %= (i + 1); + swap_free_obj(slab, i, rand); + } + } else { + for (i = 0; i < count; i++) + set_free_obj(slab, i, next_random_slot(&state)); + } + + if (OBJFREELIST_SLAB(cachep)) + set_free_obj(slab, cachep->num - 1, objfreelist); + + return true; +} +#else +static inline bool shuffle_freelist(struct kmem_cache *cachep, + struct slab *slab) +{ + return false; +} +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + +static void cache_init_objs(struct kmem_cache *cachep, + struct slab *slab) +{ + int i; + void *objp; + bool shuffled; + + cache_init_objs_debug(cachep, slab); + + /* Try to randomize the freelist if enabled */ + shuffled = shuffle_freelist(cachep, slab); + + if (!shuffled && OBJFREELIST_SLAB(cachep)) { + slab->freelist = index_to_obj(cachep, slab, cachep->num - 1) + + obj_offset(cachep); + } + + for (i = 0; i < cachep->num; i++) { + objp = index_to_obj(cachep, slab, i); + objp = kasan_init_slab_obj(cachep, objp); + + /* constructor could break poison info */ + if (DEBUG == 0 && cachep->ctor) { + kasan_unpoison_object_data(cachep, objp); + cachep->ctor(objp); + kasan_poison_object_data(cachep, objp); + } + + if (!shuffled) + set_free_obj(slab, i, i); + } +} + +static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slab) +{ + void *objp; + + objp = index_to_obj(cachep, slab, get_free_obj(slab, slab->active)); + slab->active++; + + return objp; +} + +static void slab_put_obj(struct kmem_cache *cachep, + struct slab *slab, void *objp) +{ + unsigned int objnr = obj_to_index(cachep, slab, objp); +#if DEBUG + unsigned int i; + + /* Verify double free bug */ + for (i = slab->active; i < cachep->num; i++) { + if (get_free_obj(slab, i) == objnr) { + pr_err("slab: double free detected in cache '%s', objp %px\n", + cachep->name, objp); + BUG(); + } + } +#endif + 
slab->active--; + if (!slab->freelist) + slab->freelist = objp + obj_offset(cachep); + + set_free_obj(slab, slab->active, objnr); +} + +/* + * Grow (by 1) the number of slabs within a cache. This is called by + * kmem_cache_alloc() when there are no active objs left in a cache. + * + * Must be entered with interrupts disabled; they are enabled around the + * allocation when the gfp flags allow blocking and disabled again before + * returning. Returns the new slab, not yet on any node list (hand it to + * cache_grow_end()), or NULL if the pages or the off-slab freelist could + * not be allocated. + */ +static struct slab *cache_grow_begin(struct kmem_cache *cachep, + gfp_t flags, int nodeid) +{ + void *freelist; + size_t offset; + gfp_t local_flags; + int slab_node; + struct kmem_cache_node *n; + struct slab *slab; + + /* + * Be lazy and only check for valid flags here, keeping it out of the + * critical path in kmem_cache_alloc(). + */ + if (unlikely(flags & GFP_SLAB_BUG_MASK)) + flags = kmalloc_fix_flags(flags); + + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); + local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); + + check_irq_off(); + if (gfpflags_allow_blocking(local_flags)) + local_irq_enable(); + + /* + * Get mem for the objs. Attempt to allocate a physical page from + * 'nodeid'. + */ + slab = kmem_getpages(cachep, local_flags, nodeid); + if (!slab) + goto failed; + + slab_node = slab_nid(slab); + n = get_node(cachep, slab_node); + + /* + * Advance the per-node colour, wrapping at cachep->colour, and use it + * for this slab. The byte offset is colour * colour_off. + */ + n->colour_next++; + if (n->colour_next >= cachep->colour) + n->colour_next = 0; + + offset = n->colour_next; + if (offset >= cachep->colour) + offset = 0; + + offset *= cachep->colour_off; + + /* + * Call kasan_poison_slab() before calling alloc_slabmgmt(), so + * page_address() in the latter returns a non-tagged pointer, + * as it should be for slab pages. + */ + kasan_poison_slab(slab); + + /* Get slab management. 
*/ + freelist = alloc_slabmgmt(cachep, slab, offset, + local_flags & ~GFP_CONSTRAINT_MASK, slab_node); + if (OFF_SLAB(cachep) && !freelist) + goto opps1; + + slab->slab_cache = cachep; + slab->freelist = freelist; + + cache_init_objs(cachep, slab); + + if (gfpflags_allow_blocking(local_flags)) + local_irq_disable(); + + return slab; + +opps1: + kmem_freepages(cachep, slab); +failed: + if (gfpflags_allow_blocking(local_flags)) + local_irq_disable(); + return NULL; +} +/* + * Second half of growing a cache: link the slab returned by + * cache_grow_begin() into the lists of its node under list_lock. A NULL + * @slab is ignored. A slab with no object handed out yet (active == 0) + * goes to the tail of slabs_free; otherwise fixup_slab_list() picks the + * full or partial list. The objects still unallocated (num - active) are + * added to n->free_objects. Expects interrupts disabled. + */ +static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab) +{ + struct kmem_cache_node *n; + void *list = NULL; + + check_irq_off(); + + if (!slab) + return; + + INIT_LIST_HEAD(&slab->slab_list); + n = get_node(cachep, slab_nid(slab)); + + spin_lock(&n->list_lock); + n->total_slabs++; + if (!slab->active) { + list_add_tail(&slab->slab_list, &n->slabs_free); + n->free_slabs++; + } else + fixup_slab_list(cachep, n, slab, &list); + + STATS_INC_GROWN(cachep); + n->free_objects += cachep->num - slab->active; + spin_unlock(&n->list_lock); + + fixup_objfreelist_debug(cachep, &list); +} + +#if DEBUG + +/* + * Perform extra freeing checks: + * - detect bad pointers. + * - POISON/RED_ZONE checking + */ +static void kfree_debugcheck(const void *objp) +{ + if (!virt_addr_valid(objp)) { + pr_err("kfree_debugcheck: out of range ptr %lxh\n", + (unsigned long)objp); + BUG(); + } +} + +static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) +{ + unsigned long long redzone1, redzone2; + + redzone1 = *dbg_redzone1(cache, obj); + redzone2 = *dbg_redzone2(cache, obj); + + /* + * Redzone is ok. 
+ */ + if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) + return; + + if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) + slab_error(cache, "double free detected"); + else + slab_error(cache, "memory outside object was overwritten"); + + pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", + obj, redzone1, redzone2); +} + +static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + unsigned int objnr; + struct slab *slab; + + BUG_ON(virt_to_cache(objp) != cachep); + + objp -= obj_offset(cachep); + kfree_debugcheck(objp); + slab = virt_to_slab(objp); + + if (cachep->flags & SLAB_RED_ZONE) { + verify_redzone_free(cachep, objp); + *dbg_redzone1(cachep, objp) = RED_INACTIVE; + *dbg_redzone2(cachep, objp) = RED_INACTIVE; + } + if (cachep->flags & SLAB_STORE_USER) + *dbg_userword(cachep, objp) = (void *)caller; + + objnr = obj_to_index(cachep, slab, objp); + + BUG_ON(objnr >= cachep->num); + BUG_ON(objp != index_to_obj(cachep, slab, objnr)); + + if (cachep->flags & SLAB_POISON) { + poison_obj(cachep, objp, POISON_FREE); + slab_kernel_map(cachep, objp, 0); + } + return objp; +} + +#else +#define kfree_debugcheck(x) do { } while(0) +#define cache_free_debugcheck(x, objp, z) (objp) +#endif + +static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, + void **list) +{ +#if DEBUG + void *next = *list; + void *objp; + + while (next) { + objp = next - obj_offset(cachep); + next = *(void **)next; + poison_obj(cachep, objp, POISON_FREE); + } +#endif +} + +static inline void fixup_slab_list(struct kmem_cache *cachep, + struct kmem_cache_node *n, struct slab *slab, + void **list) +{ + /* move slabp to correct slabp list: */ + list_del(&slab->slab_list); + if (slab->active == cachep->num) { + list_add(&slab->slab_list, &n->slabs_full); + if (OBJFREELIST_SLAB(cachep)) { +#if DEBUG + /* Poisoning will be done without holding the lock */ + if (cachep->flags & SLAB_POISON) { + void **objp = slab->freelist; + + *objp = 
*list; + *list = objp; + } +#endif + slab->freelist = NULL; + } + } else + list_add(&slab->slab_list, &n->slabs_partial); +} + +/* + * Try to find non-pfmemalloc slab if needed. + * + * @slab is the candidate picked by get_first_slab(). It is returned as is + * when the caller may use pfmemalloc slabs (@pfmemalloc) or when it is not + * a pfmemalloc slab. A pfmemalloc slab is also returned, with its flag + * cleared, once n->free_objects exceeds n->free_limit. Otherwise it is + * moved to the tail of its list (free_slabs is incremented again because + * get_first_slab() decremented it for a slab taken from slabs_free) and + * the partial list, then the free list, are searched for a non-pfmemalloc + * slab. Returns NULL if there is none. + */ +static noinline struct slab *get_valid_first_slab(struct kmem_cache_node *n, + struct slab *slab, bool pfmemalloc) +{ + if (!slab) + return NULL; + + if (pfmemalloc) + return slab; + + if (!slab_test_pfmemalloc(slab)) + return slab; + + /* No need to keep pfmemalloc slab if we have enough free objects */ + if (n->free_objects > n->free_limit) { + slab_clear_pfmemalloc(slab); + return slab; + } + + /* Move pfmemalloc slab to the end of list to speed up next search */ + list_del(&slab->slab_list); + if (!slab->active) { + list_add_tail(&slab->slab_list, &n->slabs_free); + n->free_slabs++; + } else + list_add_tail(&slab->slab_list, &n->slabs_partial); + + list_for_each_entry(slab, &n->slabs_partial, slab_list) { + if (!slab_test_pfmemalloc(slab)) + return slab; + } + + n->free_touched = 1; + list_for_each_entry(slab, &n->slabs_free, slab_list) { + if (!slab_test_pfmemalloc(slab)) { + n->free_slabs--; + return slab; + } + } + + return NULL; +} +/* + * Pick the slab to allocate from: the first partial slab, else the first + * free slab (setting n->free_touched and decrementing n->free_slabs). The + * slab is not unlinked here. Caller must hold n->list_lock. When + * sk_memalloc_socks() is true the choice is filtered through + * get_valid_first_slab(). May return NULL. + */ +static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) +{ + struct slab *slab; + + assert_spin_locked(&n->list_lock); + slab = list_first_entry_or_null(&n->slabs_partial, struct slab, + slab_list); + if (!slab) { + n->free_touched = 1; + slab = list_first_entry_or_null(&n->slabs_free, struct slab, + slab_list); + if (slab) + n->free_slabs--; + } + + if (sk_memalloc_socks()) + slab = get_valid_first_slab(n, slab, pfmemalloc); + + return slab; +} +/* + * Take a single object from the first available slab of @n, pfmemalloc + * slabs included, provided gfp_pfmemalloc_allowed(@flags). Returns NULL + * when that is not allowed or when the node has no partial or free slab. + */ +static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, + struct kmem_cache_node *n, gfp_t flags) +{ + struct slab *slab; + void *obj; + void *list = NULL; + + if (!gfp_pfmemalloc_allowed(flags)) + return NULL; + + spin_lock(&n->list_lock); + slab = get_first_slab(n, true); + if (!slab) { + spin_unlock(&n->list_lock); + return NULL; + } + + obj = slab_get_obj(cachep, 
slab); + n->free_objects--; + + fixup_slab_list(cachep, n, slab, &list); + + spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + + return obj; +} + +/* + * Slab list should be fixed up by fixup_slab_list() for existing slab + * or cache_grow_end() for new slab + */ +static __always_inline int alloc_block(struct kmem_cache *cachep, + struct array_cache *ac, struct slab *slab, int batchcount) +{ + /* + * There must be at least one object available for + * allocation. + */ + BUG_ON(slab->active >= cachep->num); + + while (slab->active < cachep->num && batchcount--) { + STATS_INC_ALLOCED(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + ac->entry[ac->avail++] = slab_get_obj(cachep, slab); + } + + return batchcount; +} + +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) +{ + int batchcount; + struct kmem_cache_node *n; + struct array_cache *ac, *shared; + int node; + void *list = NULL; + struct slab *slab; + + check_irq_off(); + node = numa_mem_id(); + + ac = cpu_cache_get(cachep); + batchcount = ac->batchcount; + if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { + /* + * If there was little recent activity on this cache, then + * perform only a partial refill. Otherwise we could generate + * refill bouncing. + */ + batchcount = BATCHREFILL_LIMIT; + } + n = get_node(cachep, node); + + BUG_ON(ac->avail > 0 || !n); + shared = READ_ONCE(n->shared); + if (!n->free_objects && (!shared || !shared->avail)) + goto direct_grow; + + spin_lock(&n->list_lock); + shared = READ_ONCE(n->shared); + + /* See if we can refill from the shared array */ + if (shared && transfer_objects(ac, shared, batchcount)) { + shared->touched = 1; + goto alloc_done; + } + + while (batchcount > 0) { + /* Get slab alloc is to come from. 
*/ + slab = get_first_slab(n, false); + if (!slab) + goto must_grow; + + check_spinlock_acquired(cachep); + + batchcount = alloc_block(cachep, ac, slab, batchcount); + fixup_slab_list(cachep, n, slab, &list); + } + +must_grow: + n->free_objects -= ac->avail; +alloc_done: + spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + +direct_grow: + if (unlikely(!ac->avail)) { + /* Check if we can use obj in pfmemalloc slab */ + if (sk_memalloc_socks()) { + void *obj = cache_alloc_pfmemalloc(cachep, n, flags); + + if (obj) + return obj; + } + + slab = cache_grow_begin(cachep, gfp_exact_node(flags), node); + + /* + * cache_grow_begin() can reenable interrupts, + * then ac could change. + */ + ac = cpu_cache_get(cachep); + if (!ac->avail && slab) + alloc_block(cachep, ac, slab, batchcount); + cache_grow_end(cachep, slab); + + if (!ac->avail) + return NULL; + } + ac->touched = 1; + + return ac->entry[--ac->avail]; +} + +#if DEBUG +static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, + gfp_t flags, void *objp, unsigned long caller) +{ + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); + if (!objp || is_kfence_address(objp)) + return objp; + if (cachep->flags & SLAB_POISON) { + check_poison_obj(cachep, objp); + slab_kernel_map(cachep, objp, 1); + poison_obj(cachep, objp, POISON_INUSE); + } + if (cachep->flags & SLAB_STORE_USER) + *dbg_userword(cachep, objp) = (void *)caller; + + if (cachep->flags & SLAB_RED_ZONE) { + if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || + *dbg_redzone2(cachep, objp) != RED_INACTIVE) { + slab_error(cachep, "double free, or memory outside object was overwritten"); + pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); + } + *dbg_redzone1(cachep, objp) = RED_ACTIVE; + *dbg_redzone2(cachep, objp) = RED_ACTIVE; + } + + objp += obj_offset(cachep); + if (cachep->ctor && cachep->flags & SLAB_POISON) + cachep->ctor(objp); + if ((unsigned long)objp & 
(arch_slab_minalign() - 1)) { + pr_err("0x%px: not aligned to arch_slab_minalign()=%u\n", objp, + arch_slab_minalign()); + } + return objp; +} +#else +#define cache_alloc_debugcheck_after(a, b, objp, d) (objp) +#endif +/* + * Fast path: pop the top entry of this CPU array cache; on a miss refill + * through cache_alloc_refill(). Expects interrupts disabled. May return + * NULL when the refill fails. + */ +static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + void *objp; + struct array_cache *ac; + + check_irq_off(); + + ac = cpu_cache_get(cachep); + if (likely(ac->avail)) { + ac->touched = 1; + objp = ac->entry[--ac->avail]; + + STATS_INC_ALLOCHIT(cachep); + goto out; + } + + STATS_INC_ALLOCMISS(cachep); + objp = cache_alloc_refill(cachep, flags); + /* + * the 'ac' may be updated by cache_alloc_refill(), + * and kmemleak_erase() requires its correct value. + */ + ac = cpu_cache_get(cachep); + +out: + /* + * To avoid a false negative, if an object that is in one of the + * per-CPU caches is leaked, we need to make sure kmemleak doesn't + * treat the array pointers as a reference to the object. + */ + if (objp) + kmemleak_erase(&ac->entry[ac->avail]); + return objp; +} + +#ifdef CONFIG_NUMA +static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); + +/* + * Try allocating on another node if PFA_SPREAD_SLAB or a mempolicy is set. + * + * If we are in_interrupt, then process context, including cpusets and + * mempolicy, may not apply and should not be used for allocation policy. + * + * Returns NULL without allocating when called in_interrupt(), with + * __GFP_THISNODE, or when the node chosen by the cpuset spread or the + * mempolicy is the local one; the object from ____cache_alloc_node() + * otherwise. + */ +static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + int nid_alloc, nid_here; + + if (in_interrupt() || (flags & __GFP_THISNODE)) + return NULL; + nid_alloc = nid_here = numa_mem_id(); + if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) + nid_alloc = cpuset_slab_spread_node(); + else if (current->mempolicy) + nid_alloc = mempolicy_slab_node(); + if (nid_alloc != nid_here) + return ____cache_alloc_node(cachep, flags, nid_alloc); + return NULL; +} + +/* + * Fallback function if there was no memory available and no objects on a + * certain node and fall back is permitted. 
First we scan all the + * available node for available objects. If that fails then we + * perform an allocation without specifying a node. This allows the page + * allocator to do its reclaim / fallback magic. We then insert the + * slab into the proper nodelist and then allocate from it. + */ +static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) +{ + struct zonelist *zonelist; + struct zoneref *z; + struct zone *zone; + enum zone_type highest_zoneidx = gfp_zone(flags); + void *obj = NULL; + struct slab *slab; + int nid; + unsigned int cpuset_mems_cookie; + + if (flags & __GFP_THISNODE) + return NULL; + +retry_cpuset: + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), flags); + +retry: + /* + * Look through allowed nodes for objects available + * from existing per node queues. + */ + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { + nid = zone_to_nid(zone); + + if (cpuset_zone_allowed(zone, flags) && + get_node(cache, nid) && + get_node(cache, nid)->free_objects) { + obj = ____cache_alloc_node(cache, + gfp_exact_node(flags), nid); + if (obj) + break; + } + } + + if (!obj) { + /* + * This allocation will be performed within the constraints + * of the current cpuset / memory policy requirements. + * We may trigger various forms of reclaim on the allowed + * set and go into memory reserves if necessary. + */ + slab = cache_grow_begin(cache, flags, numa_mem_id()); + cache_grow_end(cache, slab); + if (slab) { + nid = slab_nid(slab); + obj = ____cache_alloc_node(cache, + gfp_exact_node(flags), nid); + + /* + * Another processor may allocate the objects in + * the slab since we are not holding any locks. 
+ */ + if (!obj) + goto retry; + } + } + + if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return obj; +} + +/* + * Allocate one object straight from the slab lists of node @nodeid, + * without going through the per-CPU array cache. + * + * Takes the object from the first partial or free slab of that node. If + * there is none, grows the cache on @nodeid and takes the object from the + * new slab before cache_grow_end() puts it on a list. When no object could + * be obtained that way the result of fallback_alloc() is returned. + * Expects interrupts disabled and an existing kmem_cache_node for @nodeid. + */ +static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid) +{ + struct slab *slab; + struct kmem_cache_node *n; + void *obj = NULL; + void *list = NULL; + + VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); + n = get_node(cachep, nodeid); + BUG_ON(!n); + + check_irq_off(); + spin_lock(&n->list_lock); + slab = get_first_slab(n, false); + if (!slab) + goto must_grow; + + check_spinlock_acquired_node(cachep, nodeid); + + STATS_INC_NODEALLOCS(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + BUG_ON(slab->active == cachep->num); + + obj = slab_get_obj(cachep, slab); + n->free_objects--; + + fixup_slab_list(cachep, n, slab, &list); + + spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + return obj; + +must_grow: + spin_unlock(&n->list_lock); + slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); + if (slab) { + /* This slab isn't counted yet so don't update free_objects */ + obj = slab_get_obj(cachep, slab); + } + cache_grow_end(cachep, slab); + + return obj ? obj : fallback_alloc(cachep, flags); +} +/* + * NUMA allocation dispatcher. For NUMA_NO_NODE it first tries + * alternate_node_alloc() when a mempolicy or cpuset slab spreading is + * active, then the local per-CPU cache. For the local node it uses the + * per-CPU cache. For a node without a kmem_cache_node it returns the + * result of fallback_alloc(). Whatever is still unsatisfied is passed to + * ____cache_alloc_node(). + */ +static __always_inline void * +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid) +{ + void *objp = NULL; + int slab_node = numa_mem_id(); + + if (nodeid == NUMA_NO_NODE) { + if (current->mempolicy || cpuset_do_slab_mem_spread()) { + objp = alternate_node_alloc(cachep, flags); + if (objp) + goto out; + } + /* + * Use the locally cached objects if possible. + * However ____cache_alloc does not allow fallback + * to other nodes. It may fail while we still have + * objects on other nodes available. 
+ */ + objp = ____cache_alloc(cachep, flags); + nodeid = slab_node; + } else if (nodeid == slab_node) { + objp = ____cache_alloc(cachep, flags); + } else if (!get_node(cachep, nodeid)) { + /* Node not bootstrapped yet */ + objp = fallback_alloc(cachep, flags); + goto out; + } + + /* + * We may just have run out of memory on the local node. + * ____cache_alloc_node() knows how to locate memory on other nodes + */ + if (!objp) + objp = ____cache_alloc_node(cachep, flags, nodeid); +out: + return objp; +} +#else + +static __always_inline void * +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused) +{ + return ____cache_alloc(cachep, flags); +} + +#endif /* CONFIG_NUMA */ + +static __always_inline void * +slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, + int nodeid, size_t orig_size, unsigned long caller) +{ + unsigned long save_flags; + void *objp; + struct obj_cgroup *objcg = NULL; + bool init = false; + + flags &= gfp_allowed_mask; + cachep = slab_pre_alloc_hook(cachep, lru, &objcg, 1, flags); + if (unlikely(!cachep)) + return NULL; + + objp = kfence_alloc(cachep, orig_size, flags); + if (unlikely(objp)) + goto out; + + local_irq_save(save_flags); + objp = __do_cache_alloc(cachep, flags, nodeid); + local_irq_restore(save_flags); + objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); + prefetchw(objp); + init = slab_want_init_on_alloc(flags, cachep); + +out: + slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init); + return objp; +} + +static __always_inline void * +slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, + size_t orig_size, unsigned long caller) +{ + return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size, + caller); +} + +/* + * Caller needs to acquire correct kmem_cache_node's list_lock + * @list: List of detached free slabs should be freed by caller + */ +static void free_block(struct kmem_cache *cachep, void **objpp, + int nr_objects, int 
node, struct list_head *list) +{ + int i; + struct kmem_cache_node *n = get_node(cachep, node); + struct slab *slab; + + n->free_objects += nr_objects; + + for (i = 0; i < nr_objects; i++) { + void *objp; + struct slab *slab; + + objp = objpp[i]; + + slab = virt_to_slab(objp); + list_del(&slab->slab_list); + check_spinlock_acquired_node(cachep, node); + slab_put_obj(cachep, slab, objp); + STATS_DEC_ACTIVE(cachep); + + /* fixup slab chains */ + if (slab->active == 0) { + list_add(&slab->slab_list, &n->slabs_free); + n->free_slabs++; + } else { + /* Unconditionally move a slab to the end of the + * partial list on free - maximum time for the + * other objects to be freed, too. + */ + list_add_tail(&slab->slab_list, &n->slabs_partial); + } + } + + while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) { + n->free_objects -= cachep->num; + + slab = list_last_entry(&n->slabs_free, struct slab, slab_list); + list_move(&slab->slab_list, list); + n->free_slabs--; + n->total_slabs--; + } +} + +static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) +{ + int batchcount; + struct kmem_cache_node *n; + int node = numa_mem_id(); + LIST_HEAD(list); + + batchcount = ac->batchcount; + + check_irq_off(); + n = get_node(cachep, node); + spin_lock(&n->list_lock); + if (n->shared) { + struct array_cache *shared_array = n->shared; + int max = shared_array->limit - shared_array->avail; + if (max) { + if (batchcount > max) + batchcount = max; + memcpy(&(shared_array->entry[shared_array->avail]), + ac->entry, sizeof(void *) * batchcount); + shared_array->avail += batchcount; + goto free_done; + } + } + + free_block(cachep, ac->entry, batchcount, node, &list); +free_done: +#if STATS + { + int i = 0; + struct slab *slab; + + list_for_each_entry(slab, &n->slabs_free, slab_list) { + BUG_ON(slab->active); + + i++; + } + STATS_SET_FREEABLE(cachep, i); + } +#endif + spin_unlock(&n->list_lock); + ac->avail -= batchcount; + memmove(ac->entry, 
&(ac->entry[batchcount]), sizeof(void *)*ac->avail); + slabs_destroy(cachep, &list); +} + +/* + * Release an obj back to its cache. If the obj has a constructed state, it must + * be in this state _before_ it is released. Called with disabled ints. + */ +static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + bool init; + + memcg_slab_free_hook(cachep, virt_to_slab(objp), &objp, 1); + + if (is_kfence_address(objp)) { + kmemleak_free_recursive(objp, cachep->flags); + __kfence_free(objp); + return; + } + + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_free and initialization memset must be + * kept together to avoid discrepancies in behavior. + */ + init = slab_want_init_on_free(cachep); + if (init && !kasan_has_integrated_init()) + memset(objp, 0, cachep->object_size); + /* KASAN might put objp into memory quarantine, delaying its reuse. */ + if (kasan_slab_free(cachep, objp, init)) + return; + + /* Use KCSAN to help debug racy use-after-free. */ + if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU)) + __kcsan_check_access(objp, cachep->object_size, + KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); + + ___cache_free(cachep, objp, caller); +} + +void ___cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + struct array_cache *ac = cpu_cache_get(cachep); + + check_irq_off(); + kmemleak_free_recursive(objp, cachep->flags); + objp = cache_free_debugcheck(cachep, objp, caller); + + /* + * Skip calling cache_free_alien() when the platform is not numa. + * This will avoid cache misses that happen while accessing slabp (which + * is per page memory reference) to get nodeid. Instead use a global + * variable to skip the call, which is mostly likely to be present in + * the cache. 
+ */ + if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) + return; + + if (ac->avail < ac->limit) { + STATS_INC_FREEHIT(cachep); + } else { + STATS_INC_FREEMISS(cachep); + cache_flusharray(cachep, ac); + } + + if (sk_memalloc_socks()) { + struct slab *slab = virt_to_slab(objp); + + if (unlikely(slab_test_pfmemalloc(slab))) { + cache_free_pfmemalloc(cachep, slab, objp); + return; + } + } + + __free_one(ac, objp); +} + +static __always_inline +void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, + gfp_t flags) +{ + void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_); + + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE); + + return ret; +} + +/** + * kmem_cache_alloc - Allocate an object + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * + * Allocate an object from this cache. The flags are only relevant + * if the cache has no available objects. + * + * Return: pointer to the new object or %NULL in case of error + */ +void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + return __kmem_cache_alloc_lru(cachep, NULL, flags); +} +EXPORT_SYMBOL(kmem_cache_alloc); + +void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, + gfp_t flags) +{ + return __kmem_cache_alloc_lru(cachep, lru, flags); +} +EXPORT_SYMBOL(kmem_cache_alloc_lru); + +static __always_inline void +cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, unsigned long caller) +{ + size_t i; + + for (i = 0; i < size; i++) + p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); +} + +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + size_t i; + struct obj_cgroup *objcg = NULL; + + s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); + if (!s) + return 0; + + local_irq_disable(); + for (i = 0; i < size; i++) { + void *objp = kfence_alloc(s, s->object_size, flags) ?: + __do_cache_alloc(s, flags, 
NUMA_NO_NODE); + + if (unlikely(!objp)) + goto error; + p[i] = objp; + } + local_irq_enable(); + + cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); + + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled section. + */ + slab_post_alloc_hook(s, objcg, flags, size, p, + slab_want_init_on_alloc(flags, s)); + /* FIXME: Trace call missing. Christoph would like a bulk variant */ + return size; +error: + local_irq_enable(); + cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); + slab_post_alloc_hook(s, objcg, flags, i, p, false); + kmem_cache_free_bulk(s, i, p); + return 0; +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + +/** + * kmem_cache_alloc_node - Allocate an object on the specified node + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * @nodeid: node number of the target node. + * + * Identical to kmem_cache_alloc but it will allocate memory on the given + * node, which can improve the performance for cpu bound structures. + * + * Fallback to other node is possible if __GFP_THISNODE is not set. 
+ * + * Return: pointer to the new object or %NULL in case of error + */ +void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) +{ + void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); + + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid); + + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc_node); + +void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid, size_t orig_size, + unsigned long caller) +{ + return slab_alloc_node(cachep, NULL, flags, nodeid, + orig_size, caller); +} + +#ifdef CONFIG_PRINTK +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) +{ + struct kmem_cache *cachep; + unsigned int objnr; + void *objp; + + kpp->kp_ptr = object; + kpp->kp_slab = slab; + cachep = slab->slab_cache; + kpp->kp_slab_cache = cachep; + objp = object - obj_offset(cachep); + kpp->kp_data_offset = obj_offset(cachep); + slab = virt_to_slab(objp); + objnr = obj_to_index(cachep, slab, objp); + objp = index_to_obj(cachep, slab, objnr); + kpp->kp_objp = objp; + if (DEBUG && cachep->flags & SLAB_STORE_USER) + kpp->kp_ret = *dbg_userword(cachep, objp); +} +#endif + +static __always_inline +void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + unsigned long flags; + + local_irq_save(flags); + debug_check_no_locks_freed(objp, cachep->object_size); + if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, cachep->object_size); + __cache_free(cachep, objp, caller); + local_irq_restore(flags); +} + +void __kmem_cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + __do_kmem_cache_free(cachep, objp, caller); +} + +/** + * kmem_cache_free - Deallocate an object + * @cachep: The cache the allocation was from. + * @objp: The previously allocated object. + * + * Free an object which was previously allocated from this + * cache. 
+ */ +void kmem_cache_free(struct kmem_cache *cachep, void *objp) +{ + cachep = cache_from_obj(cachep, objp); + if (!cachep) + return; + + trace_kmem_cache_free(_RET_IP_, objp, cachep); + __do_kmem_cache_free(cachep, objp, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_free); + +void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +{ + + local_irq_disable(); + for (int i = 0; i < size; i++) { + void *objp = p[i]; + struct kmem_cache *s; + + if (!orig_s) { + struct folio *folio = virt_to_folio(objp); + + /* called via kfree_bulk */ + if (!folio_test_slab(folio)) { + local_irq_enable(); + free_large_kmalloc(folio, objp); + local_irq_disable(); + continue; + } + s = folio_slab(folio)->slab_cache; + } else { + s = cache_from_obj(orig_s, objp); + } + + if (!s) + continue; + + debug_check_no_locks_freed(objp, s->object_size); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, s->object_size); + + __cache_free(s, objp, _RET_IP_); + } + local_irq_enable(); + + /* FIXME: add tracing */ +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +/* + * This initializes kmem_cache_node or resizes various caches for all nodes. + */ +static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp) +{ + int ret; + int node; + struct kmem_cache_node *n; + + for_each_online_node(node) { + ret = setup_kmem_cache_node(cachep, node, gfp, true); + if (ret) + goto fail; + + } + + return 0; + +fail: + if (!cachep->list.next) { + /* Cache is not active yet. 
Roll back what we did */ + node--; + while (node >= 0) { + n = get_node(cachep, node); + if (n) { + kfree(n->shared); + free_alien_cache(n->alien); + kfree(n); + cachep->node[node] = NULL; + } + node--; + } + } + return -ENOMEM; +} + +/* Always called with the slab_mutex held */ +static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + int batchcount, int shared, gfp_t gfp) +{ + struct array_cache __percpu *cpu_cache, *prev; + int cpu; + + cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount); + if (!cpu_cache) + return -ENOMEM; + + prev = cachep->cpu_cache; + cachep->cpu_cache = cpu_cache; + /* + * Without a previous cpu_cache there's no need to synchronize remote + * cpus, so skip the IPIs. + */ + if (prev) + kick_all_cpus_sync(); + + check_irq_on(); + cachep->batchcount = batchcount; + cachep->limit = limit; + cachep->shared = shared; + + if (!prev) + goto setup_node; + + for_each_online_cpu(cpu) { + LIST_HEAD(list); + int node; + struct kmem_cache_node *n; + struct array_cache *ac = per_cpu_ptr(prev, cpu); + + node = cpu_to_mem(cpu); + n = get_node(cachep, node); + spin_lock_irq(&n->list_lock); + free_block(cachep, ac->entry, ac->avail, node, &list); + spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); + } + free_percpu(prev); + +setup_node: + return setup_kmem_cache_nodes(cachep, gfp); +} + +/* Called with slab_mutex held always */ +static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) +{ + int err; + int limit = 0; + int shared = 0; + int batchcount = 0; + + err = cache_random_seq_create(cachep, cachep->num, gfp); + if (err) + goto end; + + /* + * The head array serves three purposes: + * - create a LIFO ordering, i.e. return objects that are cache-warm + * - reduce the number of spinlock operations. + * - reduce the number of linked list operations on the slab and + * bufctl chains: array operations are cheaper. + * The numbers are guessed, we should auto-tune as described by + * Bonwick. 
+ */ + if (cachep->size > 131072) + limit = 1; + else if (cachep->size > PAGE_SIZE) + limit = 8; + else if (cachep->size > 1024) + limit = 24; + else if (cachep->size > 256) + limit = 54; + else + limit = 120; + + /* + * CPU bound tasks (e.g. network routing) can exhibit cpu bound + * allocation behaviour: Most allocs on one cpu, most free operations + * on another cpu. For these cases, an efficient object passing between + * cpus is necessary. This is provided by a shared array. The array + * replaces Bonwick's magazine layer. + * On uniprocessor, it's functionally equivalent (but less efficient) + * to a larger limit. Thus disabled by default. + */ + shared = 0; + if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) + shared = 8; + +#if DEBUG + /* + * With debugging enabled, large batchcount lead to excessively long + * periods with disabled local interrupts. Limit the batchcount + */ + if (limit > 32) + limit = 32; +#endif + batchcount = (limit + 1) / 2; + err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); +end: + if (err) + pr_err("enable_cpucache failed for %s, error %d\n", + cachep->name, -err); + return err; +} + +/* + * Drain an array if it contains any elements taking the node lock only if + * necessary. Note that the node listlock also protects the array_cache + * if drain_array() is used on the shared array. + */ +static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, + struct array_cache *ac, int node) +{ + LIST_HEAD(list); + + /* ac from n->shared can be freed if we don't hold the slab_mutex. */ + check_mutex_acquired(); + + if (!ac || !ac->avail) + return; + + if (ac->touched) { + ac->touched = 0; + return; + } + + spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, ac, node, false, &list); + spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); +} + +/** + * cache_reap - Reclaim memory from caches. + * @w: work descriptor + * + * Called from workqueue/eventd every few seconds. 
+ * Purpose: + * - clear the per-cpu caches for this CPU. + * - return freeable pages to the main free memory pool. + * + * If we cannot acquire the cache chain mutex then just give up - we'll try + * again on the next iteration. + */ +static void cache_reap(struct work_struct *w) +{ + struct kmem_cache *searchp; + struct kmem_cache_node *n; + int node = numa_mem_id(); + struct delayed_work *work = to_delayed_work(w); + + if (!mutex_trylock(&slab_mutex)) + /* Give up. Setup the next iteration. */ + goto out; + + list_for_each_entry(searchp, &slab_caches, list) { + check_irq_on(); + + /* + * We only take the node lock if absolutely necessary and we + * have established with reasonable certainty that + * we can do some work if the lock was obtained. + */ + n = get_node(searchp, node); + + reap_alien(searchp, n); + + drain_array(searchp, n, cpu_cache_get(searchp), node); + + /* + * These are racy checks but it does not matter + * if we skip one check or scan twice. + */ + if (time_after(n->next_reap, jiffies)) + goto next; + + n->next_reap = jiffies + REAPTIMEOUT_NODE; + + drain_array(searchp, n, n->shared, node); + + if (n->free_touched) + n->free_touched = 0; + else { + int freed; + + freed = drain_freelist(searchp, n, (n->free_limit + + 5 * searchp->num - 1) / (5 * searchp->num)); + STATS_ADD_REAPED(searchp, freed); + } +next: + cond_resched(); + } + check_irq_on(); + mutex_unlock(&slab_mutex); + next_reap_node(); +out: + /* Set up the next iteration */ + schedule_delayed_work_on(smp_processor_id(), work, + round_jiffies_relative(REAPTIMEOUT_AC)); +} + +void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) +{ + unsigned long active_objs, num_objs, active_slabs; + unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0; + unsigned long free_slabs = 0; + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(cachep, node, n) { + check_irq_on(); + spin_lock_irq(&n->list_lock); + + total_slabs += n->total_slabs; + free_slabs += 
n->free_slabs; + free_objs += n->free_objects; + + if (n->shared) + shared_avail += n->shared->avail; + + spin_unlock_irq(&n->list_lock); + } + num_objs = total_slabs * cachep->num; + active_slabs = total_slabs - free_slabs; + active_objs = num_objs - free_objs; + + sinfo->active_objs = active_objs; + sinfo->num_objs = num_objs; + sinfo->active_slabs = active_slabs; + sinfo->num_slabs = total_slabs; + sinfo->shared_avail = shared_avail; + sinfo->limit = cachep->limit; + sinfo->batchcount = cachep->batchcount; + sinfo->shared = cachep->shared; + sinfo->objects_per_slab = cachep->num; + sinfo->cache_order = cachep->gfporder; +} + +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) +{ +#if STATS + { /* node stats */ + unsigned long high = cachep->high_mark; + unsigned long allocs = cachep->num_allocations; + unsigned long grown = cachep->grown; + unsigned long reaped = cachep->reaped; + unsigned long errors = cachep->errors; + unsigned long max_freeable = cachep->max_freeable; + unsigned long node_allocs = cachep->node_allocs; + unsigned long node_frees = cachep->node_frees; + unsigned long overflows = cachep->node_overflow; + + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu", + allocs, high, grown, + reaped, errors, max_freeable, node_allocs, + node_frees, overflows); + } + /* cpu stats */ + { + unsigned long allochit = atomic_read(&cachep->allochit); + unsigned long allocmiss = atomic_read(&cachep->allocmiss); + unsigned long freehit = atomic_read(&cachep->freehit); + unsigned long freemiss = atomic_read(&cachep->freemiss); + + seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", + allochit, allocmiss, freehit, freemiss); + } +#endif +} + +#define MAX_SLABINFO_WRITE 128 +/** + * slabinfo_write - Tuning for the slab allocator + * @file: unused + * @buffer: user buffer + * @count: data length + * @ppos: unused + * + * Return: %0 on success, negative error code otherwise. 
+ */ +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; + int limit, batchcount, shared, res; + struct kmem_cache *cachep; + + if (count > MAX_SLABINFO_WRITE) + return -EINVAL; + if (copy_from_user(&kbuf, buffer, count)) + return -EFAULT; + kbuf[MAX_SLABINFO_WRITE] = '\0'; + + tmp = strchr(kbuf, ' '); + if (!tmp) + return -EINVAL; + *tmp = '\0'; + tmp++; + if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) + return -EINVAL; + + /* Find the cache in the chain of caches. */ + mutex_lock(&slab_mutex); + res = -EINVAL; + list_for_each_entry(cachep, &slab_caches, list) { + if (!strcmp(cachep->name, kbuf)) { + if (limit < 1 || batchcount < 1 || + batchcount > limit || shared < 0) { + res = 0; + } else { + res = do_tune_cpucache(cachep, limit, + batchcount, shared, + GFP_KERNEL); + } + break; + } + } + mutex_unlock(&slab_mutex); + if (res >= 0) + res = count; + return res; +} + +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects incorrectly sized objects and objects that are to be copied + * to/from userspace but do not fall entirely within the containing slab + * cache's usercopy region. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user) +{ + struct kmem_cache *cachep; + unsigned int objnr; + unsigned long offset; + + ptr = kasan_reset_tag(ptr); + + /* Find and validate object. */ + cachep = slab->slab_cache; + objnr = obj_to_index(cachep, slab, (void *)ptr); + BUG_ON(objnr >= cachep->num); + + /* Find offset within object. */ + if (is_kfence_address(ptr)) + offset = ptr - kfence_object_start(ptr); + else + offset = ptr - index_to_obj(cachep, slab, objnr) - obj_offset(cachep); + + /* Allow address range falling entirely within usercopy region. 
*/ + if (offset >= cachep->useroffset && + offset - cachep->useroffset <= cachep->usersize && + n <= cachep->useroffset - offset + cachep->usersize) + return; + + usercopy_abort("SLAB object", cachep->name, to_user, offset, n); +} +#endif /* CONFIG_HARDENED_USERCOPY */ diff --git a/mm/slab.h b/mm/slab.h new file mode 100644 index 000000000..0202a8c2f --- /dev/null +++ b/mm/slab.h @@ -0,0 +1,874 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MM_SLAB_H +#define MM_SLAB_H +/* + * Internal slab definitions + */ + +/* Reuses the bits in struct page */ +struct slab { + unsigned long __page_flags; + +#if defined(CONFIG_SLAB) + + union { + struct list_head slab_list; + struct rcu_head rcu_head; + }; + struct kmem_cache *slab_cache; + void *freelist; /* array of free object indexes */ + void *s_mem; /* first object */ + unsigned int active; + +#elif defined(CONFIG_SLUB) + + union { + struct list_head slab_list; + struct rcu_head rcu_head; +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct { + struct slab *next; + int slabs; /* Nr of slabs left */ + }; +#endif + }; + struct kmem_cache *slab_cache; + /* Double-word boundary */ + void *freelist; /* first free object */ + union { + unsigned long counters; + struct { + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; + unsigned int __unused; + +#elif defined(CONFIG_SLOB) + + struct list_head slab_list; + void *__unused_1; + void *freelist; /* first free block */ + long units; + unsigned int __unused_2; + +#else +#error "Unexpected slab allocator configured" +#endif + + atomic_t __page_refcount; +#ifdef CONFIG_MEMCG + unsigned long memcg_data; +#endif +}; + +#define SLAB_MATCH(pg, sl) \ + static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) +SLAB_MATCH(flags, __page_flags); +SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */ +#ifndef CONFIG_SLOB +SLAB_MATCH(rcu_head, rcu_head); +#endif +SLAB_MATCH(_refcount, __page_refcount); +#ifdef CONFIG_MEMCG +SLAB_MATCH(memcg_data, 
memcg_data); +#endif +#undef SLAB_MATCH +static_assert(sizeof(struct slab) <= sizeof(struct page)); + +/** + * folio_slab - Converts from folio to slab. + * @folio: The folio. + * + * Currently struct slab is a different representation of a folio where + * folio_test_slab() is true. + * + * Return: The slab which contains this folio. + */ +#define folio_slab(folio) (_Generic((folio), \ + const struct folio *: (const struct slab *)(folio), \ + struct folio *: (struct slab *)(folio))) + +/** + * slab_folio - The folio allocated for a slab + * @slab: The slab. + * + * Slabs are allocated as folios that contain the individual objects and are + * using some fields in the first struct page of the folio - those fields are + * now accessed by struct slab. It is occasionally necessary to convert back to + * a folio in order to communicate with the rest of the mm. Please use this + * helper function instead of casting yourself, as the implementation may change + * in the future. + */ +#define slab_folio(s) (_Generic((s), \ + const struct slab *: (const struct folio *)s, \ + struct slab *: (struct folio *)s)) + +/** + * page_slab - Converts from first struct page to slab. + * @p: The first (either head of compound or single) page of slab. + * + * A temporary wrapper to convert struct page to struct slab in situations where + * we know the page is the compound head, or single order-0 page. + * + * Long-term ideally everything would work with struct slab directly or go + * through folio to struct slab. + * + * Return: The slab which contains this page + */ +#define page_slab(p) (_Generic((p), \ + const struct page *: (const struct slab *)(p), \ + struct page *: (struct slab *)(p))) + +/** + * slab_page - The first struct page allocated for a slab + * @slab: The slab. + * + * A convenience wrapper for converting slab to the first struct page of the + * underlying folio, to communicate with code not yet converted to folio or + * struct slab. 
+ */ +#define slab_page(s) folio_page(slab_folio(s), 0) + +/* + * If network-based swap is enabled, sl*b must keep track of whether pages + * were allocated from pfmemalloc reserves. + */ +static inline bool slab_test_pfmemalloc(const struct slab *slab) +{ + return folio_test_active((struct folio *)slab_folio(slab)); +} + +static inline void slab_set_pfmemalloc(struct slab *slab) +{ + folio_set_active(slab_folio(slab)); +} + +static inline void slab_clear_pfmemalloc(struct slab *slab) +{ + folio_clear_active(slab_folio(slab)); +} + +static inline void __slab_clear_pfmemalloc(struct slab *slab) +{ + __folio_clear_active(slab_folio(slab)); +} + +static inline void *slab_address(const struct slab *slab) +{ + return folio_address(slab_folio(slab)); +} + +static inline int slab_nid(const struct slab *slab) +{ + return folio_nid(slab_folio(slab)); +} + +static inline pg_data_t *slab_pgdat(const struct slab *slab) +{ + return folio_pgdat(slab_folio(slab)); +} + +static inline struct slab *virt_to_slab(const void *addr) +{ + struct folio *folio = virt_to_folio(addr); + + if (!folio_test_slab(folio)) + return NULL; + + return folio_slab(folio); +} + +static inline int slab_order(const struct slab *slab) +{ + return folio_order((struct folio *)slab_folio(slab)); +} + +static inline size_t slab_size(const struct slab *slab) +{ + return PAGE_SIZE << slab_order(slab); +} + +#ifdef CONFIG_SLOB +/* + * Common fields provided in kmem_cache by all slab allocators + * This struct is either used directly by the allocator (SLOB) + * or the allocator must include definitions for all fields + * provided in kmem_cache_common in their definition of kmem_cache. + * + * Once we can do anonymous structs (C11 standard) we could put a + * anonymous struct definition in these allocators so that the + * separate allocations in the kmem_cache structure of SLAB and + * SLUB is no longer needed. 
+ */ +struct kmem_cache { + unsigned int object_size;/* The original size of the object */ + unsigned int size; /* The aligned/padded/added on size */ + unsigned int align; /* Alignment as calculated */ + slab_flags_t flags; /* Active flags on the slab */ + unsigned int useroffset;/* Usercopy region offset */ + unsigned int usersize; /* Usercopy region size */ + const char *name; /* Slab name for sysfs */ + int refcount; /* Use counter */ + void (*ctor)(void *); /* Called on object slot creation */ + struct list_head list; /* List of all slab caches on the system */ +}; + +#endif /* CONFIG_SLOB */ + +#ifdef CONFIG_SLAB +#include +#endif + +#ifdef CONFIG_SLUB +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +/* + * State of the slab allocator. + * + * This is used to describe the states of the allocator during bootup. + * Allocators use this to gradually bootstrap themselves. Most allocators + * have the problem that the structures used for managing slab caches are + * allocated from slab caches themselves. 
+ */ +enum slab_state { + DOWN, /* No slab functionality yet */ + PARTIAL, /* SLUB: kmem_cache_node available */ + PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ + UP, /* Slab caches usable but not all extras yet */ + FULL /* Everything is working */ +}; + +extern enum slab_state slab_state; + +/* The slab cache mutex protects the management structures during changes */ +extern struct mutex slab_mutex; + +/* The list of all slab caches on the system */ +extern struct list_head slab_caches; + +/* The slab cache that manages slab cache information */ +extern struct kmem_cache *kmem_cache; + +/* A table of kmalloc cache names and sizes */ +extern const struct kmalloc_info_struct { + const char *name[NR_KMALLOC_TYPES]; + unsigned int size; +} kmalloc_info[]; + +#ifndef CONFIG_SLOB +/* Kmalloc array related functions */ +void setup_kmalloc_cache_index_table(void); +void create_kmalloc_caches(slab_flags_t); + +/* Find the kmalloc slab corresponding for a certain size */ +struct kmem_cache *kmalloc_slab(size_t, gfp_t); + +void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, + unsigned long caller); +void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller); +#endif + +gfp_t kmalloc_fix_flags(gfp_t flags); + +/* Functions provided by the slab allocators */ +int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); + +struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size, + slab_flags_t flags, unsigned int useroffset, + unsigned int usersize); +extern void create_boot_cache(struct kmem_cache *, const char *name, + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize); + +int slab_unmergeable(struct kmem_cache *s); +struct kmem_cache *find_mergeable(unsigned size, unsigned align, + slab_flags_t flags, const char *name, void (*ctor)(void *)); +#ifndef CONFIG_SLOB +struct kmem_cache * +__kmem_cache_alias(const char *name, unsigned 
int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)); + +slab_flags_t kmem_cache_flags(unsigned int object_size, + slab_flags_t flags, const char *name); +#else +static inline struct kmem_cache * +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)) +{ return NULL; } + +static inline slab_flags_t kmem_cache_flags(unsigned int object_size, + slab_flags_t flags, const char *name) +{ + return flags; +} +#endif + + +/* Legal flag mask for kmem_cache_create(), for various configurations */ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA32 | SLAB_PANIC | \ + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) + +#if defined(CONFIG_DEBUG_SLAB) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#elif defined(CONFIG_SLUB_DEBUG) +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) +#else +#define SLAB_DEBUG_FLAGS (0) +#endif + +#if defined(CONFIG_SLAB) +#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ + SLAB_ACCOUNT) +#elif defined(CONFIG_SLUB) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | SLAB_ACCOUNT | SLAB_NO_USER_FLAGS) +#else +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) +#endif + +/* Common flags available with current configuration */ +#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) + +/* Common flags permitted for kmem_cache_create */ +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ + SLAB_RED_ZONE | \ + SLAB_POISON | \ + SLAB_STORE_USER | \ + SLAB_TRACE | \ + SLAB_CONSISTENCY_CHECKS | \ + SLAB_MEM_SPREAD | \ + SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | \ + SLAB_ACCOUNT | \ + SLAB_NO_USER_FLAGS) + +bool __kmem_cache_empty(struct kmem_cache *); +int __kmem_cache_shutdown(struct kmem_cache *); +void 
__kmem_cache_release(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *); +void slab_kmem_cache_release(struct kmem_cache *); + +struct seq_file; +struct file; + +struct slabinfo { + unsigned long active_objs; + unsigned long num_objs; + unsigned long active_slabs; + unsigned long num_slabs; + unsigned long shared_avail; + unsigned int limit; + unsigned int batchcount; + unsigned int shared; + unsigned int objects_per_slab; + unsigned int cache_order; +}; + +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos); + +static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) +{ + return (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; +} + +#ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SLUB_DEBUG_ON +DECLARE_STATIC_KEY_TRUE(slub_debug_enabled); +#else +DECLARE_STATIC_KEY_FALSE(slub_debug_enabled); +#endif +extern void print_tracking(struct kmem_cache *s, void *object); +long validate_slab_cache(struct kmem_cache *s); +static inline bool __slub_debug_enabled(void) +{ + return static_branch_unlikely(&slub_debug_enabled); +} +#else +static inline void print_tracking(struct kmem_cache *s, void *object) +{ +} +static inline bool __slub_debug_enabled(void) +{ + return false; +} +#endif + +/* + * Returns true if any of the specified slub_debug flags is enabled for the + * cache. Use only for flags parsed by setup_slub_debug() as it also enables + * the static key. 
+ */ +static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags) +{ + if (IS_ENABLED(CONFIG_SLUB_DEBUG)) + VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS)); + if (__slub_debug_enabled()) + return s->flags & flags; + return false; +} + +#ifdef CONFIG_MEMCG_KMEM +/* + * slab_objcgs - get the object cgroups vector associated with a slab + * @slab: a pointer to the slab struct + * + * Returns a pointer to the object cgroups vector associated with the slab, + * or NULL if no such vector has been associated yet. + */ +static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +{ + unsigned long memcg_data = READ_ONCE(slab->memcg_data); + + VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), + slab_page(slab)); + VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab)); + + return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +} + +int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab); +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, + enum node_stat_item idx, int nr); + +static inline void memcg_free_slab_cgroups(struct slab *slab) +{ + kfree(slab_objcgs(slab)); + slab->memcg_data = 0; +} + +static inline size_t obj_full_size(struct kmem_cache *s) +{ + /* + * For each accounted object there is an extra space which is used + * to store obj_cgroup membership. Charge it too. + */ + return s->size + sizeof(struct obj_cgroup *); +} + +/* + * Returns false if the allocation should fail. 
+ */ +static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) +{ + struct obj_cgroup *objcg; + + if (!memcg_kmem_enabled()) + return true; + + if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) + return true; + + objcg = get_obj_cgroup_from_current(); + if (!objcg) + return true; + + if (lru) { + int ret; + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + ret = memcg_list_lru_alloc(memcg, lru, flags); + css_put(&memcg->css); + + if (ret) + goto out; + } + + if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) + goto out; + + *objcgp = objcg; + return true; +out: + obj_cgroup_put(objcg); + return false; +} + +static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, + gfp_t flags, size_t size, + void **p) +{ + struct slab *slab; + unsigned long off; + size_t i; + + if (!memcg_kmem_enabled() || !objcg) + return; + + for (i = 0; i < size; i++) { + if (likely(p[i])) { + slab = virt_to_slab(p[i]); + + if (!slab_objcgs(slab) && + memcg_alloc_slab_cgroups(slab, s, flags, + false)) { + obj_cgroup_uncharge(objcg, obj_full_size(s)); + continue; + } + + off = obj_to_index(s, slab, p[i]); + obj_cgroup_get(objcg); + slab_objcgs(slab)[off] = objcg; + mod_objcg_state(objcg, slab_pgdat(slab), + cache_vmstat_idx(s), obj_full_size(s)); + } else { + obj_cgroup_uncharge(objcg, obj_full_size(s)); + } + } + obj_cgroup_put(objcg); +} + +static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects) +{ + struct obj_cgroup **objcgs; + int i; + + if (!memcg_kmem_enabled()) + return; + + objcgs = slab_objcgs(slab); + if (!objcgs) + return; + + for (i = 0; i < objects; i++) { + struct obj_cgroup *objcg; + unsigned int off; + + off = obj_to_index(s, slab, p[i]); + objcg = objcgs[off]; + if (!objcg) + continue; + + objcgs[off] = NULL; + obj_cgroup_uncharge(objcg, 
obj_full_size(s)); + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), + -obj_full_size(s)); + obj_cgroup_put(objcg); + } +} + +#else /* CONFIG_MEMCG_KMEM */ +static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +{ + return NULL; +} + +static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) +{ + return NULL; +} + +static inline int memcg_alloc_slab_cgroups(struct slab *slab, + struct kmem_cache *s, gfp_t gfp, + bool new_slab) +{ + return 0; +} + +static inline void memcg_free_slab_cgroups(struct slab *slab) +{ +} + +static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t objects, gfp_t flags) +{ + return true; +} + +static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, + gfp_t flags, size_t size, + void **p) +{ +} + +static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +#ifndef CONFIG_SLOB +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct slab *slab; + + slab = virt_to_slab(obj); + if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", + __func__)) + return NULL; + return slab->slab_cache; +} + +static __always_inline void account_slab(struct slab *slab, int order, + struct kmem_cache *s, gfp_t gfp) +{ + if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT)) + memcg_alloc_slab_cgroups(slab, s, gfp, true); + + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + PAGE_SIZE << order); +} + +static __always_inline void unaccount_slab(struct slab *slab, int order, + struct kmem_cache *s) +{ + if (memcg_kmem_enabled()) + memcg_free_slab_cgroups(slab); + + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + -(PAGE_SIZE << order)); +} + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + + if 
(!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && + !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) + return s; + + cachep = virt_to_cache(x); + if (WARN(cachep && cachep != s, + "%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name)) + print_tracking(cachep, x); + return cachep; +} + +void free_large_kmalloc(struct folio *folio, void *object); + +#endif /* CONFIG_SLOB */ + +size_t __ksize(const void *objp); + +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifndef CONFIG_SLUB + return s->object_size; + +#else /* CONFIG_SLUB */ +# ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; +# endif + if (s->flags & SLAB_KASAN) + return s->object_size; + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +#endif +} + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, + size_t size, gfp_t flags) +{ + flags &= gfp_allowed_mask; + + might_alloc(flags); + + if (should_failslab(s, flags)) + return NULL; + + if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)) + return NULL; + + return s; +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, + struct obj_cgroup *objcg, gfp_t flags, + size_t size, void **p, bool init) +{ + size_t i; + + flags &= gfp_allowed_mask; + + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_alloc and initialization memset must be + * kept together to avoid discrepancies in behavior. + * + * As p[i] might get tagged, memset and kmemleak hook come after KASAN. 
+ */ + for (i = 0; i < size; i++) { + p[i] = kasan_slab_alloc(s, p[i], flags, init); + if (p[i] && init && !kasan_has_integrated_init()) + memset(p[i], 0, s->object_size); + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, flags); + kmsan_slab_alloc(s, p[i], flags); + } + + memcg_slab_post_alloc_hook(s, objcg, flags, size, p); +} + +#ifndef CONFIG_SLOB +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { + spinlock_t list_lock; + +#ifdef CONFIG_SLAB + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long total_slabs; /* length of all slab lists */ + unsigned long free_slabs; /* length of free slab list only */ + unsigned long free_objects; + unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ + struct array_cache *shared; /* shared per node */ + struct alien_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ +#endif + +#ifdef CONFIG_SLUB + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif +#endif + +}; + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. 
The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __node < nr_node_ids; __node++) \ + if ((__n = get_node(__s, __node))) + +#endif + +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) +void dump_unreclaimable_slab(void); +#else +static inline void dump_unreclaimable_slab(void) +{ +} +#endif + +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); + +#ifdef CONFIG_SLAB_FREELIST_RANDOM +int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, + gfp_t gfp); +void cache_random_seq_destroy(struct kmem_cache *cachep); +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, + unsigned int count, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + +static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) +{ + if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc)) { + if (c->ctor) + return false; + if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) + return flags & __GFP_ZERO; + return true; + } + return flags & __GFP_ZERO; +} + +static inline bool slab_want_init_on_free(struct kmem_cache *c) +{ + if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, + &init_on_free)) + return !(c->ctor || + (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); + return false; +} + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG) +void debugfs_slab_release(struct kmem_cache *); +#else +static inline void debugfs_slab_release(struct kmem_cache *s) { } +#endif + +#ifdef CONFIG_PRINTK +#define KS_ADDRS_COUNT 16 +struct kmem_obj_info { + void *kp_ptr; + struct slab *kp_slab; + void *kp_objp; + unsigned long kp_data_offset; + struct kmem_cache *kp_slab_cache; + void *kp_ret; + void *kp_stack[KS_ADDRS_COUNT]; + void 
*kp_free_stack[KS_ADDRS_COUNT]; +}; +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab); +#endif + +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user); +#else +static inline +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user) +{ +} +#endif + +#endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c new file mode 100644 index 000000000..4736c0e60 --- /dev/null +++ b/mm/slab_common.c @@ -0,0 +1,1456 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Slab allocator functions that are independent of the allocator strategy + * + * (C) 2012 Christoph Lameter + */ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "slab.h" + +#define CREATE_TRACE_POINTS +#include + +enum slab_state slab_state; +LIST_HEAD(slab_caches); +DEFINE_MUTEX(slab_mutex); +struct kmem_cache *kmem_cache; + +static LIST_HEAD(slab_caches_to_rcu_destroy); +static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work); +static DECLARE_WORK(slab_caches_to_rcu_destroy_work, + slab_caches_to_rcu_destroy_workfn); + +/* + * Set of flags that will prevent slab merging + */ +#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_FAILSLAB | kasan_never_merge()) + +#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA32 | SLAB_ACCOUNT) + +/* + * Merge control. If this is set then no merging of slab caches will occur. 
+ */ +static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); + +static int __init setup_slab_nomerge(char *str) +{ + slab_nomerge = true; + return 1; +} + +static int __init setup_slab_merge(char *str) +{ + slab_nomerge = false; + return 1; +} + +#ifdef CONFIG_SLUB +__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0); +__setup_param("slub_merge", slub_merge, setup_slab_merge, 0); +#endif + +__setup("slab_nomerge", setup_slab_nomerge); +__setup("slab_merge", setup_slab_merge); + +/* + * Determine the size of a slab object + */ +unsigned int kmem_cache_size(struct kmem_cache *s) +{ + return s->object_size; +} +EXPORT_SYMBOL(kmem_cache_size); + +#ifdef CONFIG_DEBUG_VM +static int kmem_cache_sanity_check(const char *name, unsigned int size) +{ + if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) { + pr_err("kmem_cache_create(%s) integrity check failed\n", name); + return -EINVAL; + } + + WARN_ON(strchr(name, ' ')); /* It confuses parsers */ + return 0; +} +#else +static inline int kmem_cache_sanity_check(const char *name, unsigned int size) +{ + return 0; +} +#endif + +/* + * Figure out what the alignment of the objects will be given a set of + * flags, a user specified alignment and the size of the objects. + */ +static unsigned int calculate_alignment(slab_flags_t flags, + unsigned int align, unsigned int size) +{ + /* + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. + * + * The hardware cache alignment cannot override the specified + * alignment though. If that is greater then use it. 
+ */ + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned int ralign; + + ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } + + align = max(align, arch_slab_minalign()); + + return ALIGN(align, sizeof(void *)); +} + +/* + * Find a mergeable slab cache + */ +int slab_unmergeable(struct kmem_cache *s) +{ + if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE)) + return 1; + + if (s->ctor) + return 1; + + if (s->usersize) + return 1; + + /* + * We may have set a slab to be unmergeable during bootstrap. + */ + if (s->refcount < 0) + return 1; + + return 0; +} + +struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, + slab_flags_t flags, const char *name, void (*ctor)(void *)) +{ + struct kmem_cache *s; + + if (slab_nomerge) + return NULL; + + if (ctor) + return NULL; + + size = ALIGN(size, sizeof(void *)); + align = calculate_alignment(flags, align, size); + size = ALIGN(size, align); + flags = kmem_cache_flags(size, flags, name); + + if (flags & SLAB_NEVER_MERGE) + return NULL; + + list_for_each_entry_reverse(s, &slab_caches, list) { + if (slab_unmergeable(s)) + continue; + + if (size > s->size) + continue; + + if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME)) + continue; + /* + * Check if alignment is compatible. 
+ * Courtesy of Adrian Drzewiecki + */ + if ((s->size & ~(align - 1)) != s->size) + continue; + + if (s->size - size >= sizeof(void *)) + continue; + + if (IS_ENABLED(CONFIG_SLAB) && align && + (align > s->align || s->align % align)) + continue; + + return s; + } + return NULL; +} + +static struct kmem_cache *create_cache(const char *name, + unsigned int object_size, unsigned int align, + slab_flags_t flags, unsigned int useroffset, + unsigned int usersize, void (*ctor)(void *), + struct kmem_cache *root_cache) +{ + struct kmem_cache *s; + int err; + + if (WARN_ON(useroffset + usersize > object_size)) + useroffset = usersize = 0; + + err = -ENOMEM; + s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); + if (!s) + goto out; + + s->name = name; + s->size = s->object_size = object_size; + s->align = align; + s->ctor = ctor; + s->useroffset = useroffset; + s->usersize = usersize; + + err = __kmem_cache_create(s, flags); + if (err) + goto out_free_cache; + + s->refcount = 1; + list_add(&s->list, &slab_caches); +out: + if (err) + return ERR_PTR(err); + return s; + +out_free_cache: + kmem_cache_free(kmem_cache, s); + goto out; +} + +/** + * kmem_cache_create_usercopy - Create a cache with a region suitable + * for copying to userspace + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @useroffset: Usercopy region offset + * @usersize: Usercopy region size + * @ctor: A constructor for the objects. + * + * Cannot be called within an interrupt, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check + * for buffer overruns. 
+ * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + * + * Return: a pointer to the cache on success, NULL on failure. + */ +struct kmem_cache * +kmem_cache_create_usercopy(const char *name, + unsigned int size, unsigned int align, + slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, + void (*ctor)(void *)) +{ + struct kmem_cache *s = NULL; + const char *cache_name; + int err; + +#ifdef CONFIG_SLUB_DEBUG + /* + * If no slub_debug was enabled globally, the static key is not yet + * enabled by setup_slub_debug(). Enable it if the cache is being + * created with any of the debugging flags passed explicitly. + * It's also possible that this is the first cache created with + * SLAB_STORE_USER and we should init stack_depot for it. + */ + if (flags & SLAB_DEBUG_FLAGS) + static_branch_enable(&slub_debug_enabled); + if (flags & SLAB_STORE_USER) + stack_depot_init(); +#endif + + mutex_lock(&slab_mutex); + + err = kmem_cache_sanity_check(name, size); + if (err) { + goto out_unlock; + } + + /* Refuse requests with allocator specific flags */ + if (flags & ~SLAB_FLAGS_PERMITTED) { + err = -EINVAL; + goto out_unlock; + } + + /* + * Some allocators will constrain the set of valid flags to a subset + * of all flags. We expect them to define CACHE_CREATE_MASK in this + * case, and we'll just provide them with a sanitized version of the + * passed flags. + */ + flags &= CACHE_CREATE_MASK; + + /* Fail closed on bad usersize or useroffset values. 
*/ + if (WARN_ON(!usersize && useroffset) || + WARN_ON(size < usersize || size - usersize < useroffset)) + usersize = useroffset = 0; + + if (!usersize) + s = __kmem_cache_alias(name, size, align, flags, ctor); + if (s) + goto out_unlock; + + cache_name = kstrdup_const(name, GFP_KERNEL); + if (!cache_name) { + err = -ENOMEM; + goto out_unlock; + } + + s = create_cache(cache_name, size, + calculate_alignment(flags, align, size), + flags, useroffset, usersize, ctor, NULL); + if (IS_ERR(s)) { + err = PTR_ERR(s); + kfree_const(cache_name); + } + +out_unlock: + mutex_unlock(&slab_mutex); + + if (err) { + if (flags & SLAB_PANIC) + panic("%s: Failed to create slab '%s'. Error %d\n", + __func__, name, err); + else { + pr_warn("%s(%s) failed with error %d\n", + __func__, name, err); + dump_stack(); + } + return NULL; + } + return s; +} +EXPORT_SYMBOL(kmem_cache_create_usercopy); + +/** + * kmem_cache_create - Create a cache. + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @ctor: A constructor for the objects. + * + * Cannot be called within an interrupt, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + * + * Return: a pointer to the cache on success, NULL on failure. 
+ */ +struct kmem_cache * +kmem_cache_create(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)) +{ + return kmem_cache_create_usercopy(name, size, align, flags, 0, 0, + ctor); +} +EXPORT_SYMBOL(kmem_cache_create); + +#ifdef SLAB_SUPPORTS_SYSFS +/* + * For a given kmem_cache, kmem_cache_destroy() should only be called + * once or there will be a use-after-free problem. The actual deletion + * and release of the kobject does not need slab_mutex or cpu_hotplug_lock + * protection. So they are now done without holding those locks. + * + * Note that there will be a slight delay in the deletion of sysfs files + * if kmem_cache_release() is called indirectly from a work function. + */ +static void kmem_cache_release(struct kmem_cache *s) +{ + sysfs_slab_unlink(s); + sysfs_slab_release(s); +} +#else +static void kmem_cache_release(struct kmem_cache *s) +{ + slab_kmem_cache_release(s); +} +#endif + +static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) +{ + LIST_HEAD(to_destroy); + struct kmem_cache *s, *s2; + + /* + * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the + * @slab_caches_to_rcu_destroy list. The slab pages are freed + * through RCU and the associated kmem_cache are dereferenced + * while freeing the pages, so the kmem_caches should be freed only + * after the pending RCU operations are finished. As rcu_barrier() + * is a pretty slow operation, we batch all pending destructions + * asynchronously. 
+ */ + mutex_lock(&slab_mutex); + list_splice_init(&slab_caches_to_rcu_destroy, &to_destroy); + mutex_unlock(&slab_mutex); + + if (list_empty(&to_destroy)) + return; + + rcu_barrier(); + + list_for_each_entry_safe(s, s2, &to_destroy, list) { + debugfs_slab_release(s); + kfence_shutdown_cache(s); + kmem_cache_release(s); + } +} + +static int shutdown_cache(struct kmem_cache *s) +{ + /* free asan quarantined objects */ + kasan_cache_shutdown(s); + + if (__kmem_cache_shutdown(s) != 0) + return -EBUSY; + + list_del(&s->list); + + if (s->flags & SLAB_TYPESAFE_BY_RCU) { + list_add_tail(&s->list, &slab_caches_to_rcu_destroy); + schedule_work(&slab_caches_to_rcu_destroy_work); + } else { + kfence_shutdown_cache(s); + debugfs_slab_release(s); + } + + return 0; +} + +void slab_kmem_cache_release(struct kmem_cache *s) +{ + __kmem_cache_release(s); + kfree_const(s->name); + kmem_cache_free(kmem_cache, s); +} + +void kmem_cache_destroy(struct kmem_cache *s) +{ + int err = -EBUSY; + bool rcu_set; + + if (unlikely(!s) || !kasan_check_byte(s)) + return; + + cpus_read_lock(); + mutex_lock(&slab_mutex); + + rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU; + + s->refcount--; + if (s->refcount) + goto out_unlock; + + err = shutdown_cache(s); + WARN(err, "%s %s: Slab cache still has objects when called from %pS", + __func__, s->name, (void *)_RET_IP_); +out_unlock: + mutex_unlock(&slab_mutex); + cpus_read_unlock(); + if (!err && !rcu_set) + kmem_cache_release(s); +} +EXPORT_SYMBOL(kmem_cache_destroy); + +/** + * kmem_cache_shrink - Shrink a cache. + * @cachep: The cache to shrink. + * + * Releases as many slabs as possible for a cache. + * To help debugging, a zero exit status indicates all slabs were released. 
+ * + * Return: %0 if all slabs were released, non-zero otherwise + */ +int kmem_cache_shrink(struct kmem_cache *cachep) +{ + kasan_cache_shrink(cachep); + + return __kmem_cache_shrink(cachep); +} +EXPORT_SYMBOL(kmem_cache_shrink); + +bool slab_is_available(void) +{ + return slab_state >= UP; +} + +#ifdef CONFIG_PRINTK +/** + * kmem_valid_obj - does the pointer reference a valid slab object? + * @object: pointer to query. + * + * Return: %true if the pointer is to a not-yet-freed object from + * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer + * is to an already-freed object, and %false otherwise. + */ +bool kmem_valid_obj(void *object) +{ + struct folio *folio; + + /* Some arches consider ZERO_SIZE_PTR to be a valid address. */ + if (object < (void *)PAGE_SIZE || !virt_addr_valid(object)) + return false; + folio = virt_to_folio(object); + return folio_test_slab(folio); +} +EXPORT_SYMBOL_GPL(kmem_valid_obj); + +static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) +{ + if (__kfence_obj_info(kpp, object, slab)) + return; + __kmem_obj_info(kpp, object, slab); +} + +/** + * kmem_dump_obj - Print available slab provenance information + * @object: slab object for which to find provenance information. + * + * This function uses pr_cont(), so that the caller is expected to have + * printed out whatever preamble is appropriate. The provenance information + * depends on the type of object and on how much debugging is enabled. + * For a slab-cache object, the fact that it is a slab object is printed, + * and, if available, the slab name, return address, and stack trace from + * the allocation and last free path of that object. + * + * This function will splat if passed a pointer to a non-slab object. + * If you are not sure what type of object you have, you should instead + * use mem_dump_obj(). + */ +void kmem_dump_obj(void *object) +{ + char *cp = IS_ENABLED(CONFIG_MMU) ? 
"" : "/vmalloc"; + int i; + struct slab *slab; + unsigned long ptroffset; + struct kmem_obj_info kp = { }; + + if (WARN_ON_ONCE(!virt_addr_valid(object))) + return; + slab = virt_to_slab(object); + if (WARN_ON_ONCE(!slab)) { + pr_cont(" non-slab memory.\n"); + return; + } + kmem_obj_info(&kp, object, slab); + if (kp.kp_slab_cache) + pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name); + else + pr_cont(" slab%s", cp); + if (is_kfence_address(object)) + pr_cont(" (kfence)"); + if (kp.kp_objp) + pr_cont(" start %px", kp.kp_objp); + if (kp.kp_data_offset) + pr_cont(" data offset %lu", kp.kp_data_offset); + if (kp.kp_objp) { + ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset; + pr_cont(" pointer offset %lu", ptroffset); + } + if (kp.kp_slab_cache && kp.kp_slab_cache->usersize) + pr_cont(" size %u", kp.kp_slab_cache->usersize); + if (kp.kp_ret) + pr_cont(" allocated at %pS\n", kp.kp_ret); + else + pr_cont("\n"); + for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) { + if (!kp.kp_stack[i]) + break; + pr_info(" %pS\n", kp.kp_stack[i]); + } + + if (kp.kp_free_stack[0]) + pr_cont(" Free path:\n"); + + for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) { + if (!kp.kp_free_stack[i]) + break; + pr_info(" %pS\n", kp.kp_free_stack[i]); + } + +} +EXPORT_SYMBOL_GPL(kmem_dump_obj); +#endif + +#ifndef CONFIG_SLOB +/* Create a cache during boot when no slab services are available yet */ +void __init create_boot_cache(struct kmem_cache *s, const char *name, + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize) +{ + int err; + unsigned int align = ARCH_KMALLOC_MINALIGN; + + s->name = name; + s->size = s->object_size = size; + + /* + * For power of two sizes, guarantee natural alignment for kmalloc + * caches, regardless of SL*B debugging options. 
+ */ + if (is_power_of_2(size)) + align = max(align, size); + s->align = calculate_alignment(flags, align, size); + + s->useroffset = useroffset; + s->usersize = usersize; + + err = __kmem_cache_create(s, flags); + + if (err) + panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n", + name, size, err); + + s->refcount = -1; /* Exempt from merging for now */ +} + +struct kmem_cache *__init create_kmalloc_cache(const char *name, + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize) +{ + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + + if (!s) + panic("Out of memory when creating slab %s\n", name); + + create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset, + usersize); + kasan_cache_create_kmalloc(s); + list_add(&s->list, &slab_caches); + s->refcount = 1; + return s; +} + +struct kmem_cache * +kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init = +{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ }; +EXPORT_SYMBOL(kmalloc_caches); + +/* + * Conversion table for small slabs sizes / 8 to the index in the + * kmalloc array. This is necessary for slabs < 192 since we have non power + * of two cache sizes there. The size of larger slabs can be determined using + * fls. 
+ */
+static u8 size_index[24] __ro_after_init = {
+	3,	/* 8 */
+	4,	/* 16 */
+	5,	/* 24 */
+	5,	/* 32 */
+	6,	/* 40 */
+	6,	/* 48 */
+	6,	/* 56 */
+	6,	/* 64 */
+	1,	/* 72 */
+	1,	/* 80 */
+	1,	/* 88 */
+	1,	/* 96 */
+	7,	/* 104 */
+	7,	/* 112 */
+	7,	/* 120 */
+	7,	/* 128 */
+	2,	/* 136 */
+	2,	/* 144 */
+	2,	/* 152 */
+	2,	/* 160 */
+	2,	/* 168 */
+	2,	/* 176 */
+	2,	/* 184 */
+	2	/* 192 */
+};
+/*
+ * Map a request size in bytes to its slot in size_index[]: sizes 1..8 use
+ * slot 0, sizes 9..16 use slot 1, and so on. @bytes must be non-zero,
+ * otherwise the unsigned subtraction wraps around.
+ */
+static inline unsigned int size_index_elem(unsigned int bytes)
+{
+	return (bytes - 1) / 8;
+}
+
+/*
+ * Find the kmem_cache structure that serves a given size of
+ * allocation.
+ *
+ * Sizes up to 192 bytes are looked up through size_index[]; larger sizes
+ * use fls(size - 1) as the cache index. The cache type (first dimension of
+ * kmalloc_caches[]) is derived from @flags by kmalloc_type().
+ *
+ * Returns ZERO_SIZE_PTR for a zero @size, and NULL (after a one-time
+ * warning) when @size is larger than KMALLOC_MAX_CACHE_SIZE.
+ */
+struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
+{
+	unsigned int index;
+
+	if (size <= 192) {
+		if (!size)
+			return ZERO_SIZE_PTR;
+
+		index = size_index[size_index_elem(size)];
+	} else {
+		if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE))
+			return NULL;
+		index = fls(size - 1);
+	}
+
+	return kmalloc_caches[kmalloc_type(flags)][index];
+}
+/*
+ * kmalloc_size_roundup - report the allocation size backing a request
+ * @size: number of bytes the caller intends to request
+ *
+ * Returns 0 for a zero @size and SIZE_MAX for SIZE_MAX. Sizes above
+ * KMALLOC_MAX_CACHE_SIZE are rounded up to PAGE_SIZE << get_order(size).
+ * Everything else reports the object_size of the cache picked by
+ * kmalloc_slab(), or 0 if that lookup yields NULL.
+ */
+size_t kmalloc_size_roundup(size_t size)
+{
+	struct kmem_cache *c;
+
+	/* Short-circuit the 0 size case. */
+	if (unlikely(size == 0))
+		return 0;
+	/* Short-circuit saturated "too-large" case. */
+	if (unlikely(size == SIZE_MAX))
+		return SIZE_MAX;
+	/* Above the smaller buckets, size is a multiple of page size. */
+	if (size > KMALLOC_MAX_CACHE_SIZE)
+		return PAGE_SIZE << get_order(size);
+
+	/* The flags don't matter since size_index is common to all. */
+	c = kmalloc_slab(size, GFP_KERNEL);
+	return c ? c->object_size : 0;
+}
+EXPORT_SYMBOL(kmalloc_size_roundup);
+/*
+ * Designated-initializer fragments that name the DMA and cgroup variants
+ * of a kmalloc cache. Each one expands to nothing when its config option
+ * is not set.
+ */
+#ifdef CONFIG_ZONE_DMA
+#define KMALLOC_DMA_NAME(sz)	.name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
+#else
+#define KMALLOC_DMA_NAME(sz)
+#endif
+
+#ifdef CONFIG_MEMCG_KMEM
+#define KMALLOC_CGROUP_NAME(sz)	.name[KMALLOC_CGROUP] = "kmalloc-cg-" #sz,
+#else
+#define KMALLOC_CGROUP_NAME(sz)
+#endif
+/*
+ * Build one kmalloc_info[] entry: @__size is the object size in bytes and
+ * @__short_size is the suffix stringified into every cache name.
+ */
+#define INIT_KMALLOC_INFO(__size, __short_size)			\
+{								\
+	.name[KMALLOC_NORMAL]  = "kmalloc-" #__short_size,	\
+	.name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size,	\
+	KMALLOC_CGROUP_NAME(__short_size)			\
+	KMALLOC_DMA_NAME(__short_size)				\
+	.size = __size,						\
+}
+
+/*
+ * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
+ * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
+ * kmalloc-2M.
+ *
+ * The table is laid out by cache index: slot 0 is an empty placeholder,
+ * slots 1 and 2 hold the 96 and 192 byte caches, and slot n (n >= 3) holds
+ * the 2^n byte cache.
+ */
+const struct kmalloc_info_struct kmalloc_info[] __initconst = {
+	INIT_KMALLOC_INFO(0, 0),
+	INIT_KMALLOC_INFO(96, 96),
+	INIT_KMALLOC_INFO(192, 192),
+	INIT_KMALLOC_INFO(8, 8),
+	INIT_KMALLOC_INFO(16, 16),
+	INIT_KMALLOC_INFO(32, 32),
+	INIT_KMALLOC_INFO(64, 64),
+	INIT_KMALLOC_INFO(128, 128),
+	INIT_KMALLOC_INFO(256, 256),
+	INIT_KMALLOC_INFO(512, 512),
+	INIT_KMALLOC_INFO(1024, 1k),
+	INIT_KMALLOC_INFO(2048, 2k),
+	INIT_KMALLOC_INFO(4096, 4k),
+	INIT_KMALLOC_INFO(8192, 8k),
+	INIT_KMALLOC_INFO(16384, 16k),
+	INIT_KMALLOC_INFO(32768, 32k),
+	INIT_KMALLOC_INFO(65536, 64k),
+	INIT_KMALLOC_INFO(131072, 128k),
+	INIT_KMALLOC_INFO(262144, 256k),
+	INIT_KMALLOC_INFO(524288, 512k),
+	INIT_KMALLOC_INFO(1048576, 1M),
+	INIT_KMALLOC_INFO(2097152, 2M)
+};
+
+/*
+ * Patch up the size_index table if we have strange large alignment
+ * requirements for the kmalloc array. This is only the case for
+ * MIPS it seems. The standard arches will not generate any code here.
+ *
+ * Largest permitted alignment is 256 bytes due to the way we
+ * handle the index determination for the smaller caches.
+ *
+ * Make sure that nothing crazy happens if someone starts tinkering
+ * around with ARCH_KMALLOC_MINALIGN
+ */
+void __init setup_kmalloc_cache_index_table(void)
+{
+	unsigned int i;
+
+	BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
+		!is_power_of_2(KMALLOC_MIN_SIZE));
+	/* Every size below the minimum is served by the smallest cache. */
+	for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
+		unsigned int elem = size_index_elem(i);
+
+		if (elem >= ARRAY_SIZE(size_index))
+			break;
+		size_index[elem] = KMALLOC_SHIFT_LOW;
+	}
+
+	if (KMALLOC_MIN_SIZE >= 64) {
+		/*
+		 * The 96 byte sized cache is not used if the alignment
+		 * is 64 byte. Sizes 72..96 are redirected to index 7,
+		 * which is the 128 byte cache.
+		 */
+		for (i = 64 + 8; i <= 96; i += 8)
+			size_index[size_index_elem(i)] = 7;
+
+	}
+
+	if (KMALLOC_MIN_SIZE >= 128) {
+		/*
+		 * The 192 byte sized cache is not used if the alignment
+		 * is 128 byte. Redirect kmalloc to use the 256 byte cache
+		 * (index 8) instead.
+		 */
+		for (i = 128 + 8; i <= 192; i += 8)
+			size_index[size_index_elem(i)] = 8;
+	}
+}
+/*
+ * Create the kmalloc cache for slot @idx of cache type @type and store it
+ * in kmalloc_caches[type][idx]. The cache name and object size come from
+ * kmalloc_info[idx]; the type-specific slab flag is OR-ed into @flags.
+ *
+ * For KMALLOC_CGROUP with kmem accounting disabled no cache is created:
+ * the slot simply aliases the KMALLOC_NORMAL cache of the same index.
+ */
+static void __init
+new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
+{
+	if (type == KMALLOC_RECLAIM) {
+		flags |= SLAB_RECLAIM_ACCOUNT;
+	} else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) {
+		if (mem_cgroup_kmem_disabled()) {
+			kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
+			return;
+		}
+		flags |= SLAB_ACCOUNT;
+	} else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) {
+		flags |= SLAB_CACHE_DMA;
+	}
+
+	kmalloc_caches[type][idx] = create_kmalloc_cache(
+					kmalloc_info[idx].name[type],
+					kmalloc_info[idx].size, flags, 0,
+					kmalloc_info[idx].size);
+
+	/*
+	 * If CONFIG_MEMCG_KMEM is enabled, disable cache merging for
+	 * KMALLOC_NORMAL caches (a refcount of -1 exempts a cache from
+	 * merging).
+	 */
+	if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_NORMAL))
+		kmalloc_caches[type][idx]->refcount = -1;
+}
+
+/*
+ * Create the kmalloc array. Some of the regular kmalloc arrays
+ * may already have been created because they were needed to
+ * enable allocations for slab creation.
+ */
+void __init create_kmalloc_caches(slab_flags_t flags)
+{
+	int i;
+	enum kmalloc_cache_type type;
+
+	/*
+	 * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined
+	 */
+	for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
+		for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
+			if (!kmalloc_caches[type][i])
+				new_kmalloc_cache(i, type, flags);
+
+			/*
+			 * Caches that are not of the two-to-the-power-of size
+			 * (index 1 and index 2 of the kmalloc array).
+			 * These have to be created immediately after the
+			 * earlier power of two caches
+			 */
+			if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
+					!kmalloc_caches[type][1])
+				new_kmalloc_cache(1, type, flags);
+			if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
+					!kmalloc_caches[type][2])
+				new_kmalloc_cache(2, type, flags);
+		}
+	}
+
+	/* Kmalloc array is now usable */
+	slab_state = UP;
+}
+/*
+ * Free a kmalloc() allocation that is backed directly by pages instead of
+ * a slab. @folio is the folio that contains @object.
+ *
+ * Runs the kmemleak, KASAN and KMSAN free hooks, subtracts the folio size
+ * from NR_SLAB_UNRECLAIMABLE_B and hands the pages back via __free_pages().
+ * An order-0 folio is unexpected here: it triggers a one-time warning that
+ * prints the object pointer, but the folio is still freed.
+ */
+void free_large_kmalloc(struct folio *folio, void *object)
+{
+	unsigned int order = folio_order(folio);
+
+	if (WARN_ON_ONCE(order == 0))
+		pr_warn_once("object pointer: 0x%p\n", object);
+
+	kmemleak_free(object);
+	kasan_kfree_large(object);
+	kmsan_kfree_large(object);
+
+	mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
+			      -(PAGE_SIZE << order));
+	__free_pages(folio_page(folio, 0), order);
+}
+/*
+ * __do_kmalloc_node() is the common implementation behind __kmalloc(),
+ * __kmalloc_node() and __kmalloc_node_track_caller().
+ *
+ * Requests larger than KMALLOC_MAX_CACHE_SIZE are passed on to
+ * __kmalloc_large_node(), which is defined further down in this file
+ * (hence the forward declaration). Everything else is served from the
+ * cache selected by kmalloc_slab(); a ZERO_SIZE_PTR or NULL result of that
+ * lookup is returned to the caller unchanged. @caller is the address
+ * reported to the allocator and to the kmalloc tracepoint.
+ */
+static void *__kmalloc_large_node(size_t size, gfp_t flags, int node);
+static __always_inline
+void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
+{
+	struct kmem_cache *s;
+	void *ret;
+
+	if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+		ret = __kmalloc_large_node(size, flags, node);
+		trace_kmalloc(caller, ret, size,
+			      PAGE_SIZE << get_order(size), flags, node);
+		return ret;
+	}
+
+	s = kmalloc_slab(size, flags);
+
+	if (unlikely(ZERO_OR_NULL_PTR(s)))
+		return s;
+
+	ret = __kmem_cache_alloc_node(s, flags, node, size, caller);
+	ret = kasan_kmalloc(s, ret, size, flags);
+	trace_kmalloc(caller, ret, size, s->size, flags, node);
+	return ret;
+}
+/* Allocate @size bytes on @node; the recorded caller is our return address. */
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __do_kmalloc_node(size, flags, node, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+/* Allocate @size bytes without a node preference (NUMA_NO_NODE). */
+void *__kmalloc(size_t size, gfp_t flags)
+{
+	return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc);
+/* Like __kmalloc_node(), but records the @caller address that is passed in. */
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+				  int node, unsigned long caller)
+{
+	return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+
+/**
+ * kfree - free previously allocated memory
+ * @object: pointer returned by kmalloc.
+ *
+ * If @object is NULL or ZERO_SIZE_PTR, no operation is performed apart
+ * from emitting the kfree tracepoint.
+ *
+ * An object whose folio is not a slab folio is handed to
+ * free_large_kmalloc(); any other object is returned to the cache that
+ * owns its slab.
+ *
+ * Don't free memory not originally allocated by kmalloc()
+ * or you will run into trouble.
+ */
+void kfree(const void *object)
+{
+	struct folio *folio;
+	struct slab *slab;
+	struct kmem_cache *s;
+
+	trace_kfree(_RET_IP_, object);
+
+	if (unlikely(ZERO_OR_NULL_PTR(object)))
+		return;
+
+	folio = virt_to_folio(object);
+	if (unlikely(!folio_test_slab(folio))) {
+		free_large_kmalloc(folio, (void *)object);
+		return;
+	}
+
+	slab = folio_slab(folio);
+	s = slab->slab_cache;
+	__kmem_cache_free(s, (void *)object, _RET_IP_);
+}
+EXPORT_SYMBOL(kfree);
+
+/**
+ * __ksize -- Report full size of underlying allocation
+ * @object: pointer to the object
+ *
+ * This should only be used internally to query the true size of allocations.
+ * It is not meant to be a way to discover the usable size of an allocation
+ * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
+ * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
+ * and/or FORTIFY_SOURCE.
+ *
+ * Return: size of the actual memory used by @object in bytes
+ */
+size_t __ksize(const void *object)
+{
+	struct folio *folio;
+
+	if (unlikely(object == ZERO_SIZE_PTR))
+		return 0;
+
+	folio = virt_to_folio(object);
+	/* Not a slab folio: must be a large allocation starting at the folio. */
+	if (unlikely(!folio_test_slab(folio))) {
+		if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
+			return 0;
+		if (WARN_ON(object != folio_address(folio)))
+			return 0;
+		return folio_size(folio);
+	}
+
+	return slab_ksize(folio_slab(folio)->slab_cache);
+}
+/*
+ * Allocate @size bytes from cache @s without a node preference, emit the
+ * kmalloc tracepoint and apply the KASAN kmalloc hook to the result. The
+ * caller address recorded is this function's return address.
+ */
+void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+{
+	void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE,
+					    size, _RET_IP_);
+
+	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE);
+
+	ret = kasan_kmalloc(s, ret, size, gfpflags);
+	return ret;
+}
+EXPORT_SYMBOL(kmalloc_trace);
+/* Same as kmalloc_trace(), but allocates on the requested @node. */
+void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
+			 int node, size_t size)
+{
+	void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_);
+
+	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node);
+
+	ret = kasan_kmalloc(s, ret, size, gfpflags);
+	return ret;
+}
+EXPORT_SYMBOL(kmalloc_node_trace);
+#endif /* !CONFIG_SLOB */
+/*
+ * Clear the bits covered by GFP_SLAB_BUG_MASK from @flags. The offending
+ * bits and the corrected mask are reported with pr_warn() together with a
+ * stack dump, and the cleaned-up flags are returned.
+ */
+gfp_t kmalloc_fix_flags(gfp_t flags)
+{
+	gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
+
+	flags &= ~GFP_SLAB_BUG_MASK;
+	pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
+			invalid_mask, &invalid_mask, flags, &flags);
+	dump_stack();
+
+	return flags;
+}
+
+/*
+ * To avoid unnecessary overhead, we pass through large allocation requests
+ * directly to the page allocator. We use __GFP_COMP, because we will need to
+ * know the allocation order to free the pages properly in kfree.
+ */
+/*
+ * Allocate get_order(@size) pages on @node as a compound page and account
+ * them in NR_SLAB_UNRECLAIMABLE_B. GFP bits covered by GFP_SLAB_BUG_MASK
+ * are stripped by kmalloc_fix_flags() first. The KASAN, kmemleak and KMSAN
+ * hooks are called with the resulting pointer, which is NULL when the page
+ * allocation failed.
+ */
+static void *__kmalloc_large_node(size_t size, gfp_t flags, int node)
+{
+	struct page *page;
+	void *ptr = NULL;
+	unsigned int order = get_order(size);
+
+	if (unlikely(flags & GFP_SLAB_BUG_MASK))
+		flags = kmalloc_fix_flags(flags);
+
+	flags |= __GFP_COMP;
+	page = alloc_pages_node(node, flags, order);
+	if (page) {
+		ptr = page_address(page);
+		mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+				      PAGE_SIZE << order);
+	}
+
+	ptr = kasan_kmalloc_large(ptr, size, flags);
+	/* As ptr might get tagged, call kmemleak hook after KASAN. */
+	kmemleak_alloc(ptr, size, 1, flags);
+	kmsan_kmalloc_large(ptr, size, flags);
+
+	return ptr;
+}
+/* Page-backed allocation without a node preference, with tracing. */
+void *kmalloc_large(size_t size, gfp_t flags)
+{
+	void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE);
+
+	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
+		      flags, NUMA_NO_NODE);
+	return ret;
+}
+EXPORT_SYMBOL(kmalloc_large);
+/* Page-backed allocation on the requested @node, with tracing. */
+void *kmalloc_large_node(size_t size, gfp_t flags, int node)
+{
+	void *ret = __kmalloc_large_node(size, flags, node);
+
+	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
+		      flags, node);
+	return ret;
+}
+EXPORT_SYMBOL(kmalloc_large_node);
+
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+/*
+ * Randomize a generic freelist: fill @list with 0 .. @count - 1 and shuffle
+ * it in place with random numbers drawn from @state.
+ *
+ * @count must not be 0, because the shuffle loop starts at @count - 1 in
+ * unsigned arithmetic. The caller in this file passes at least 2.
+ */
+static void freelist_randomize(struct rnd_state *state, unsigned int *list,
+			       unsigned int count)
+{
+	unsigned int rand;
+	unsigned int i;
+
+	for (i = 0; i < count; i++)
+		list[i] = i;
+
+	/* Fisher-Yates shuffle */
+	for (i = count - 1; i > 0; i--) {
+		rand = prandom_u32_state(state);
+		rand %= (i + 1);
+		swap(list[i], list[rand]);
+	}
+}
+
+/*
+ * Create a random sequence per cache.
+ *
+ * Returns 0 on success and also when there is nothing to do (@count is
+ * below 2, or @cachep already has a sequence), -ENOMEM when the sequence
+ * array cannot be allocated.
+ */
+int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
+				    gfp_t gfp)
+{
+	struct rnd_state state;
+
+	if (count < 2 || cachep->random_seq)
+		return 0;
+
+	cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp);
+	if (!cachep->random_seq)
+		return -ENOMEM;
+
+	/* Get best entropy at this stage of boot */
+	prandom_seed_state(&state, get_random_long());
+
+	freelist_randomize(&state, cachep->random_seq, count);
+	return 0;
+}
+
+/* Destroy the per-cache random freelist sequence */
+void cache_random_seq_destroy(struct kmem_cache *cachep)
+{
+	kfree(cachep->random_seq);
+	cachep->random_seq = NULL;
+}
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
+#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
+#ifdef CONFIG_SLAB
+#define SLABINFO_RIGHTS (0600)
+#else
+#define SLABINFO_RIGHTS (0400)
+#endif
+/* Emit the version line and the column headings of /proc/slabinfo. */
+static void print_slabinfo_header(struct seq_file *m)
+{
+	/*
+	 * Output format version, so at least we can change it
+	 * without _too_ many complaints.
+	 */
+#ifdef CONFIG_DEBUG_SLAB
+	seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
+#else
+	seq_puts(m, "slabinfo - version: 2.1\n");
+#endif
+	seq_puts(m, "# name ");
+	seq_puts(m, " : tunables ");
+	seq_puts(m, " : slabdata ");
+#ifdef CONFIG_DEBUG_SLAB
+	seq_puts(m, " : globalstat ");
+	seq_puts(m, " : cpustat ");
+#endif
+	seq_putc(m, '\n');
+}
+/*
+ * seq_file iterator callbacks for /proc/slabinfo. slab_mutex is taken in
+ * slab_start() and dropped in slab_stop(), so the slab_caches list is
+ * walked with the mutex held.
+ */
+static void *slab_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&slab_mutex);
+	return seq_list_start(&slab_caches, *pos);
+}
+
+static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	return seq_list_next(p, &slab_caches, pos);
+}
+
+static void slab_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&slab_mutex);
+}
+/*
+ * Print one /proc/slabinfo line for cache @s. The slabinfo structure is
+ * zeroed before get_slabinfo() fills it in.
+ */
+static void cache_show(struct kmem_cache *s, struct seq_file *m)
+{
+	struct slabinfo sinfo;
+
+	memset(&sinfo, 0, sizeof(sinfo));
+	get_slabinfo(s, &sinfo);
+
+	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
+		   s->name, sinfo.active_objs, sinfo.num_objs, s->size,
+		   sinfo.objects_per_slab, (1 << sinfo.cache_order));
+
+	seq_printf(m, " : tunables %4u %4u %4u",
+		   sinfo.limit, sinfo.batchcount, sinfo.shared);
+	seq_printf(m, " : slabdata %6lu %6lu %6lu",
+		   sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
+	slabinfo_show_stats(m, s);
+	seq_putc(m, '\n');
+}
+/*
+ * seq_file ->show callback: print the header ahead of the first cache on
+ * the list, then the line for the cache that @p belongs to.
+ */
+static int slab_show(struct seq_file *m, void *p)
+{
+	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+
+	if (p == slab_caches.next)
+		print_slabinfo_header(m);
+	cache_show(s, m);
+	return 0;
+}
+/*
+ * Log used and total size, in KB, of every cache that does not have
+ * SLAB_RECLAIM_ACCOUNT set and owns at least one object.
+ */
+void dump_unreclaimable_slab(void)
+{
+	struct kmem_cache *s;
+	struct slabinfo sinfo;
+
+	/*
+	 * Acquiring slab_mutex here is risky because we would rather not
+	 * sleep in the OOM path, but walking the list without holding the
+	 * mutex risks a crash.
+	 * Use mutex_trylock() to protect the list traversal and dump
+	 * nothing if the mutex cannot be acquired.
+	 */
+	if (!mutex_trylock(&slab_mutex)) {
+		pr_warn("excessive unreclaimable slab but cannot dump stats\n");
+		return;
+	}
+
+	pr_info("Unreclaimable slab info:\n");
+	pr_info("Name Used Total\n");
+
+	list_for_each_entry(s, &slab_caches, list) {
+		if (s->flags & SLAB_RECLAIM_ACCOUNT)
+			continue;
+
+		get_slabinfo(s, &sinfo);
+
+		if (sinfo.num_objs > 0)
+			pr_info("%-17s %10luKB %10luKB\n", s->name,
+				(sinfo.active_objs * s->size) / 1024,
+				(sinfo.num_objs * s->size) / 1024);
+	}
+	mutex_unlock(&slab_mutex);
+}
+
+/*
+ * slabinfo_op - iterator that generates /proc/slabinfo
+ *
+ * Output layout:
+ * cache-name
+ * num-active-objs
+ * total-objs
+ * object size
+ * num-active-slabs
+ * total-slabs
+ * num-pages-per-slab
+ * + further values on SMP and with statistics enabled
+ */
+static const struct seq_operations slabinfo_op = {
+	.start = slab_start,
+	.next = slab_next,
+	.stop = slab_stop,
+	.show = slab_show,
+};
+
+static int slabinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &slabinfo_op);
+}
+
+static const struct proc_ops slabinfo_proc_ops = {
+	.proc_flags	= PROC_ENTRY_PERMANENT,
+	.proc_open	= slabinfo_open,
+	.proc_read	= seq_read,
+	.proc_write	= slabinfo_write,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= seq_release,
+};
+/* Register /proc/slabinfo with the access mode SLABINFO_RIGHTS. */
+static int __init slab_proc_init(void)
+{
+	proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops);
+	return 0;
+}
+module_init(slab_proc_init);
+
+#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
+/*
+ * Core of krealloc(). Returns @p itself when the existing object already
+ * holds at least @new_size bytes; otherwise returns a fresh allocation
+ * with the old contents copied over. The old object is never freed here,
+ * that is left to krealloc(). Returns NULL when the new allocation fails
+ * or when kasan_check_byte() rejects @p.
+ */
+static __always_inline __realloc_size(2) void *
+__do_krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+	void *ret;
+	size_t ks;
+
+	/* Don't use instrumented ksize to allow precise KASAN poisoning. */
+	if (likely(!ZERO_OR_NULL_PTR(p))) {
+		if (!kasan_check_byte(p))
+			return NULL;
+		ks = kfence_ksize(p) ?: __ksize(p);
+	} else
+		ks = 0;
+
+	/* If the object still fits, repoison it precisely. */
+	if (ks >= new_size) {
+		p = kasan_krealloc((void *)p, new_size, flags);
+		return (void *)p;
+	}
+
+	ret = kmalloc_track_caller(new_size, flags);
+	if (ret && p) {
+		/* Disable KASAN checks as the object's redzone is accessed. */
+		kasan_disable_current();
+		memcpy(ret, kasan_reset_tag(p), ks);
+		kasan_enable_current();
+	}
+
+	return ret;
+}
+
+/**
+ * krealloc - reallocate memory. The contents will remain unchanged.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * The contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes (__GFP_ZERO flag is effectively ignored).
+ * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
+ * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
+ */
+void *krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+	void *ret;
+
+	if (unlikely(!new_size)) {
+		kfree(p);
+		return ZERO_SIZE_PTR;
+	}
+
+	ret = __do_krealloc(p, new_size, flags);
+	if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
+		kfree(p);
+
+	return ret;
+}
+EXPORT_SYMBOL(krealloc);
+
+/**
+ * kfree_sensitive - Clear sensitive information in memory before freeing
+ * @p: object to free memory of
+ *
+ * The memory of the object @p points to is zeroed before freed.
+ * If @p is %NULL, kfree_sensitive() does nothing.
+ * + * Note: this function zeroes the whole allocated buffer which can be a good + * deal bigger than the requested buffer size passed to kmalloc(). So be + * careful when using this function in performance sensitive code. + */ +void kfree_sensitive(const void *p) +{ + size_t ks; + void *mem = (void *)p; + + ks = ksize(mem); + if (ks) + memzero_explicit(mem, ks); + kfree(mem); +} +EXPORT_SYMBOL(kfree_sensitive); + +size_t ksize(const void *objp) +{ + size_t size; + + /* + * We need to first check that the pointer to the object is valid, and + * only then unpoison the memory. The report printed from ksize() is + * more useful, then when it's printed later when the behaviour could + * be undefined due to a potential use-after-free or double-free. + * + * We use kasan_check_byte(), which is supported for the hardware + * tag-based KASAN mode, unlike kasan_check_read/write(). + * + * If the pointed to memory is invalid, we return 0 to avoid users of + * ksize() writing to and potentially corrupting the memory region. + * + * We want to perform the check before __ksize(), to avoid potentially + * crashing in __ksize() due to accessing invalid metadata. + */ + if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) + return 0; + + size = kfence_ksize(objp) ?: __ksize(objp); + /* + * We assume that ksize callers could use whole allocated area, + * so we need to unpoison this area. + */ + kasan_unpoison_range(objp, size); + return size; +} +EXPORT_SYMBOL(ksize); + +/* Tracepoints definitions. 
*/ +EXPORT_TRACEPOINT_SYMBOL(kmalloc); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); +EXPORT_TRACEPOINT_SYMBOL(kfree); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); +/* + * should_failslab - fault-injection hook for slab allocations. + * Returns -ENOMEM when __should_failslab() reports that the allocation + * from @s with @gfpflags should fail, 0 otherwise. The function is + * registered with ALLOW_ERROR_INJECTION(..., ERRNO) right below. + */ +int should_failslab(struct kmem_cache *s, gfp_t gfpflags) +{ + if (__should_failslab(s, gfpflags)) + return -ENOMEM; + return 0; +} +ALLOW_ERROR_INJECTION(should_failslab, ERRNO); diff --git a/mm/slob.c b/mm/slob.c new file mode 100644 index 000000000..fe567fcfa --- /dev/null +++ b/mm/slob.c @@ -0,0 +1,757 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * SLOB Allocator: Simple List Of Blocks + * + * Matt Mackall 12/30/03 + * + * NUMA support by Paul Mundt, 2007. + * + * How SLOB works: + * + * The core of SLOB is a traditional K&R style heap allocator, with + * support for returning aligned objects. The granularity of this + * allocator is as little as 2 bytes, however typically most architectures + * will require 4 bytes on 32-bit and 8 bytes on 64-bit. + * + * The slob heap is a set of linked list of pages from alloc_pages(), + * and within each page, there is a singly-linked list of free blocks + * (slob_t). The heap is grown on demand. To reduce fragmentation, + * heap pages are segregated into three lists, with objects less than + * 256 bytes, objects less than 1024 bytes, and all other objects. + * + * Allocation from heap involves first searching for a page with + * sufficient free blocks (using a next-fit-like approach) followed by + * a first-fit scan of the page. Deallocation inserts objects back + * into the free list in address order, so this is effectively an + * address-ordered first fit. + * + * Above this is an implementation of kmalloc/kfree. Blocks returned + * from kmalloc are prepended with a 4-byte header with the kmalloc size. + * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls + * alloc_pages() directly, allocating compound pages so the page order + * does not have to be separately tracked. 
+ * These objects are detected in kfree() because folio_test_slab() + * is false for them. + * + * SLAB is emulated on top of SLOB by simply calling constructors and + * destructors for every SLAB allocation. Objects are returned with + * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which + * case the low-level allocator will fragment blocks to create the proper + * alignment. Again, objects of page-size or greater are allocated by + * calling alloc_pages(). As SLAB objects know their size, no separate + * size bookkeeping is necessary and there is essentially no allocation + * space overhead, and compound pages aren't needed for multi-page + * allocations. + * + * NUMA support in SLOB is fairly simplistic, pushing most of the real + * logic down to the page allocator, and simply doing the node accounting + * on the upper levels. In the event that a node id is explicitly + * provided, __alloc_pages_node() with the specified node id is used + * instead. The common case (or when the node id isn't explicitly provided) + * will default to the current node, as per numa_node_id(). + * + * Node aware pages are still inserted into the global freelist, and + * these are scanned for by matching against the node id encoded in the + * page flags. As a result, block allocations that can be satisfied from + * the freelist will only be done so on pages residing on the same node, + * in order to prevent random node placement. + */ + +#include +#include + +#include +#include /* struct reclaim_state */ +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "slab.h" +/* + * slob_block has a field 'units', which indicates size of block if +ve, + * or offset of next block if -ve (in SLOB_UNITs). + * + * Free blocks of size 1 unit simply contain the offset of the next block. + * Those with larger size contain their size in the first SLOB_UNIT of + * memory, and the offset of the next free block in the second SLOB_UNIT. 
+ */ +#if PAGE_SIZE <= (32767 * 2) +typedef s16 slobidx_t; +#else +typedef s32 slobidx_t; +#endif + +struct slob_block { + slobidx_t units; +}; +typedef struct slob_block slob_t; + +/* + * All partially free slob pages go on these lists. + */ +#define SLOB_BREAK1 256 +#define SLOB_BREAK2 1024 +static LIST_HEAD(free_slob_small); +static LIST_HEAD(free_slob_medium); +static LIST_HEAD(free_slob_large); + +/* + * slob_page_free: true for pages on free_slob_pages list. + */ +static inline int slob_page_free(struct slab *slab) +{ + return PageSlobFree(slab_page(slab)); +} + +static void set_slob_page_free(struct slab *slab, struct list_head *list) +{ + list_add(&slab->slab_list, list); + __SetPageSlobFree(slab_page(slab)); +} + +static inline void clear_slob_page_free(struct slab *slab) +{ + list_del(&slab->slab_list); + __ClearPageSlobFree(slab_page(slab)); +} + +#define SLOB_UNIT sizeof(slob_t) +#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT) + +/* + * struct slob_rcu is inserted at the tail of allocated slob blocks, which + * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free + * the block using call_rcu. + */ +struct slob_rcu { + struct rcu_head head; + int size; +}; + +/* + * slob_lock protects all slob allocator structures. + */ +static DEFINE_SPINLOCK(slob_lock); + +/* + * Encode the given size and next info into a free slob block s. + */ +static void set_slob(slob_t *s, slobidx_t size, slob_t *next) +{ + slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); + slobidx_t offset = next - base; + + if (size > 1) { + s[0].units = size; + s[1].units = offset; + } else + s[0].units = -offset; +} + +/* + * Return the size of a slob block. + */ +static slobidx_t slob_units(slob_t *s) +{ + if (s->units > 0) + return s->units; + return 1; +} + +/* + * Return the next free slob block pointer after this one. 
+ */ +static slob_t *slob_next(slob_t *s) +{ + slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); + slobidx_t next; + + if (s[0].units < 0) + next = -s[0].units; + else + next = s[1].units; + return base+next; +} + +/* + * Returns true if s is the last free block in its page. + */ +static int slob_last(slob_t *s) +{ + return !((unsigned long)slob_next(s) & ~PAGE_MASK); +} + +static void *slob_new_pages(gfp_t gfp, int order, int node) +{ + struct page *page; + +#ifdef CONFIG_NUMA + if (node != NUMA_NO_NODE) + page = __alloc_pages_node(node, gfp, order); + else +#endif + page = alloc_pages(gfp, order); + + if (!page) + return NULL; + + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); + return page_address(page); +} + +static void slob_free_pages(void *b, int order) +{ + struct page *sp = virt_to_page(b); + + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += 1 << order; + + mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); + __free_pages(sp, order); +} + +/* + * slob_page_alloc() - Allocate a slob block within a given slob_page sp. + * @sp: Page to look in. + * @size: Size of the allocation. + * @align: Allocation alignment. + * @align_offset: Offset in the allocated block that will be aligned. + * @page_removed_from_list: Return parameter. + * + * Tries to find a chunk of memory at least @size bytes big within @page. + * + * Return: Pointer to memory if allocated, %NULL otherwise. If the + * allocation fills up @page then the page is removed from the + * freelist, in this case @page_removed_from_list will be set to + * true (set to false otherwise). 
+ */
+static void *slob_page_alloc(struct slab *sp, size_t size, int align,
+			      int align_offset, bool *page_removed_from_list)
+{
+	slob_t *prev, *cur, *aligned = NULL;
+	int delta = 0, units = SLOB_UNITS(size);
+
+	*page_removed_from_list = false;
+	for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
+		slobidx_t avail = slob_units(cur);
+
+		/*
+		 * 'aligned' will hold the address of the slob block so that the
+		 * address 'aligned'+'align_offset' is aligned according to the
+		 * 'align' parameter. This is for kmalloc() which prepends the
+		 * allocated block with its size, so that the block itself is
+		 * aligned when needed.
+		 *
+		 * 'delta' is the distance from 'cur' to 'aligned' in units,
+		 * i.e. the part of the free block that has to be skipped.
+		 */
+		if (align) {
+			aligned = (slob_t *)
+				(ALIGN((unsigned long)cur + align_offset, align)
+				 - align_offset);
+			delta = aligned - cur;
+		}
+		if (avail >= units + delta) { /* room enough? */
+			slob_t *next;
+
+			if (delta) { /* need to fragment head to align? */
+				next = slob_next(cur);
+				set_slob(aligned, avail - delta, next);
+				set_slob(cur, delta, aligned);
+				prev = cur;
+				cur = aligned;
+				avail = slob_units(cur);
+			}
+
+			next = slob_next(cur);
+			if (avail == units) { /* exact fit? unlink. */
+				if (prev)
+					set_slob(prev, slob_units(prev), next);
+				else
+					sp->freelist = next;
+			} else { /* fragment: the remainder stays on the free list */
+				if (prev)
+					set_slob(prev, slob_units(prev), cur + units);
+				else
+					sp->freelist = cur + units;
+				set_slob(cur + units, avail - units, next);
+			}
+
+			sp->units -= units;
+			if (!sp->units) { /* no free units left in this page */
+				clear_slob_page_free(sp);
+				*page_removed_from_list = true;
+			}
+			return cur;
+		}
+		if (slob_last(cur)) /* end of the free list, nothing fits */
+			return NULL;
+	}
+}
+
+/*
+ * slob_alloc: entry point into the slob allocator.
+ */ +static void *slob_alloc(size_t size, gfp_t gfp, int align, int node, + int align_offset) +{ + struct folio *folio; + struct slab *sp; + struct list_head *slob_list; + slob_t *b = NULL; + unsigned long flags; + bool _unused; + + if (size < SLOB_BREAK1) + slob_list = &free_slob_small; + else if (size < SLOB_BREAK2) + slob_list = &free_slob_medium; + else + slob_list = &free_slob_large; + + spin_lock_irqsave(&slob_lock, flags); + /* Iterate through each partially free page, try to find room */ + list_for_each_entry(sp, slob_list, slab_list) { + bool page_removed_from_list = false; +#ifdef CONFIG_NUMA + /* + * If there's a node specification, search for a partial + * page with a matching node id in the freelist. + */ + if (node != NUMA_NO_NODE && slab_nid(sp) != node) + continue; +#endif + /* Enough room on this page? */ + if (sp->units < SLOB_UNITS(size)) + continue; + + b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list); + if (!b) + continue; + + /* + * If slob_page_alloc() removed sp from the list then we + * cannot call list functions on sp. If so allocation + * did not fragment the page anyway so optimisation is + * unnecessary. + */ + if (!page_removed_from_list) { + /* + * Improve fragment distribution and reduce our average + * search time by starting our next search here. 
(see + * Knuth vol 1, sec 2.5, pg 449) + */ + if (!list_is_first(&sp->slab_list, slob_list)) + list_rotate_to_front(&sp->slab_list, slob_list); + } + break; + } + spin_unlock_irqrestore(&slob_lock, flags); + + /* Not enough space: must allocate a new page */ + if (!b) { + b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); + if (!b) + return NULL; + folio = virt_to_folio(b); + __folio_set_slab(folio); + sp = folio_slab(folio); + + spin_lock_irqsave(&slob_lock, flags); + sp->units = SLOB_UNITS(PAGE_SIZE); + sp->freelist = b; + INIT_LIST_HEAD(&sp->slab_list); + set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); + set_slob_page_free(sp, slob_list); + b = slob_page_alloc(sp, size, align, align_offset, &_unused); + BUG_ON(!b); + spin_unlock_irqrestore(&slob_lock, flags); + } + if (unlikely(gfp & __GFP_ZERO)) + memset(b, 0, size); + return b; +} + +/* + * slob_free: entry point into the slob allocator. + */ +static void slob_free(void *block, int size) +{ + struct slab *sp; + slob_t *prev, *next, *b = (slob_t *)block; + slobidx_t units; + unsigned long flags; + struct list_head *slob_list; + + if (unlikely(ZERO_OR_NULL_PTR(block))) + return; + BUG_ON(!size); + + sp = virt_to_slab(block); + units = SLOB_UNITS(size); + + spin_lock_irqsave(&slob_lock, flags); + + if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) { + /* Go directly to page allocator. Do not pass slob allocator */ + if (slob_page_free(sp)) + clear_slob_page_free(sp); + spin_unlock_irqrestore(&slob_lock, flags); + __folio_clear_slab(slab_folio(sp)); + slob_free_pages(b, 0); + return; + } + + if (!slob_page_free(sp)) { + /* This slob page is about to become partially free. Easy! 
*/ + sp->units = units; + sp->freelist = b; + set_slob(b, units, + (void *)((unsigned long)(b + + SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); + if (size < SLOB_BREAK1) + slob_list = &free_slob_small; + else if (size < SLOB_BREAK2) + slob_list = &free_slob_medium; + else + slob_list = &free_slob_large; + set_slob_page_free(sp, slob_list); + goto out; + } + + /* + * Otherwise the page is already partially free, so find reinsertion + * point. + */ + sp->units += units; + + if (b < (slob_t *)sp->freelist) { + if (b + units == sp->freelist) { + units += slob_units(sp->freelist); + sp->freelist = slob_next(sp->freelist); + } + set_slob(b, units, sp->freelist); + sp->freelist = b; + } else { + prev = sp->freelist; + next = slob_next(prev); + while (b > next) { + prev = next; + next = slob_next(prev); + } + + if (!slob_last(prev) && b + units == next) { + units += slob_units(next); + set_slob(b, units, slob_next(next)); + } else + set_slob(b, units, next); + + if (prev + slob_units(prev) == b) { + units = slob_units(b) + slob_units(prev); + set_slob(prev, units, slob_next(b)); + } else + set_slob(prev, slob_units(prev), b); + } +out: + spin_unlock_irqrestore(&slob_lock, flags); +} + +#ifdef CONFIG_PRINTK +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) +{ + kpp->kp_ptr = object; + kpp->kp_slab = slab; +} +#endif + +/* + * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. + */ + +static __always_inline void * +__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) +{ + unsigned int *m; + unsigned int minalign; + void *ret; + + minalign = max_t(unsigned int, ARCH_KMALLOC_MINALIGN, + arch_slab_minalign()); + gfp &= gfp_allowed_mask; + + might_alloc(gfp); + + if (size < PAGE_SIZE - minalign) { + int align = minalign; + + /* + * For power of two sizes, guarantee natural alignment for + * kmalloc()'d objects. 
+ */ + if (is_power_of_2(size)) + align = max_t(unsigned int, minalign, size); + + if (!size) + return ZERO_SIZE_PTR; + + m = slob_alloc(size + minalign, gfp, align, node, minalign); + + if (!m) + return NULL; + *m = size; + ret = (void *)m + minalign; + + trace_kmalloc(caller, ret, size, size + minalign, gfp, node); + } else { + unsigned int order = get_order(size); + + if (likely(order)) + gfp |= __GFP_COMP; + ret = slob_new_pages(gfp, order, node); + + trace_kmalloc(caller, ret, size, PAGE_SIZE << order, gfp, node); + } + + kmemleak_alloc(ret, size, 1, gfp); + return ret; +} + +void *__kmalloc(size_t size, gfp_t gfp) +{ + return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc); + +void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, gfp, node, caller); +} +EXPORT_SYMBOL(__kmalloc_node_track_caller); + +void kfree(const void *block) +{ + struct folio *sp; + + trace_kfree(_RET_IP_, block); + + if (unlikely(ZERO_OR_NULL_PTR(block))) + return; + kmemleak_free(block); + + sp = virt_to_folio(block); + if (folio_test_slab(sp)) { + unsigned int align = max_t(unsigned int, + ARCH_KMALLOC_MINALIGN, + arch_slab_minalign()); + unsigned int *m = (unsigned int *)(block - align); + + slob_free(m, *m + align); + } else { + unsigned int order = folio_order(sp); + + mod_node_page_state(folio_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); + __free_pages(folio_page(sp, 0), order); + + } +} +EXPORT_SYMBOL(kfree); + +size_t kmalloc_size_roundup(size_t size) +{ + /* Short-circuit the 0 size case. */ + if (unlikely(size == 0)) + return 0; + /* Short-circuit saturated "too-large" case. 
*/ + if (unlikely(size == SIZE_MAX)) + return SIZE_MAX; + + return ALIGN(size, ARCH_KMALLOC_MINALIGN); +} + +EXPORT_SYMBOL(kmalloc_size_roundup); + +/* can't use ksize for kmem_cache_alloc memory, only kmalloc */ +size_t __ksize(const void *block) +{ + struct folio *folio; + unsigned int align; + unsigned int *m; + + BUG_ON(!block); + if (unlikely(block == ZERO_SIZE_PTR)) + return 0; + + folio = virt_to_folio(block); + if (unlikely(!folio_test_slab(folio))) + return folio_size(folio); + + align = max_t(unsigned int, ARCH_KMALLOC_MINALIGN, + arch_slab_minalign()); + m = (unsigned int *)(block - align); + return SLOB_UNITS(*m) * SLOB_UNIT; +} + +int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) +{ + if (flags & SLAB_TYPESAFE_BY_RCU) { + /* leave room for rcu footer at the end of object */ + c->size += sizeof(struct slob_rcu); + } + + /* Actual size allocated */ + c->size = SLOB_UNITS(c->size) * SLOB_UNIT; + c->flags = flags; + return 0; +} + +static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) +{ + void *b; + + flags &= gfp_allowed_mask; + + might_alloc(flags); + + if (c->size < PAGE_SIZE) { + b = slob_alloc(c->size, flags, c->align, node, 0); + trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); + } else { + b = slob_new_pages(flags, get_order(c->size), node); + trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); + } + + if (b && c->ctor) { + WARN_ON_ONCE(flags & __GFP_ZERO); + c->ctor(b); + } + + kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); + return b; +} + +void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +{ + return slob_alloc_node(cachep, flags, NUMA_NO_NODE); +} +EXPORT_SYMBOL(kmem_cache_alloc); + + +void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags) +{ + return slob_alloc_node(cachep, flags, NUMA_NO_NODE); +} +EXPORT_SYMBOL(kmem_cache_alloc_lru); + +void *__kmalloc_node(size_t size, gfp_t gfp, int node) +{ + return __do_kmalloc_node(size, gfp, node, 
_RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_node); + +void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node) +{ + return slob_alloc_node(cachep, gfp, node); +} +EXPORT_SYMBOL(kmem_cache_alloc_node); + +static void __kmem_cache_free(void *b, int size) +{ + if (size < PAGE_SIZE) + slob_free(b, size); + else + slob_free_pages(b, get_order(size)); +} + +static void kmem_rcu_free(struct rcu_head *head) +{ + struct slob_rcu *slob_rcu = (struct slob_rcu *)head; + void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu)); + + __kmem_cache_free(b, slob_rcu->size); +} + +void kmem_cache_free(struct kmem_cache *c, void *b) +{ + kmemleak_free_recursive(b, c->flags); + trace_kmem_cache_free(_RET_IP_, b, c); + if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { + struct slob_rcu *slob_rcu; + slob_rcu = b + (c->size - sizeof(struct slob_rcu)); + slob_rcu->size = c->size; + call_rcu(&slob_rcu->head, kmem_rcu_free); + } else { + __kmem_cache_free(b, c->size); + } +} +EXPORT_SYMBOL(kmem_cache_free); + +void kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) { + if (s) + kmem_cache_free(s, p[i]); + else + kfree(p[i]); + } +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, + void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) { + void *x = p[i] = kmem_cache_alloc(s, flags); + + if (!x) { + kmem_cache_free_bulk(s, i, p); + return 0; + } + } + return i; +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + +int __kmem_cache_shutdown(struct kmem_cache *c) +{ + /* No way to check for remaining objects */ + return 0; +} + +void __kmem_cache_release(struct kmem_cache *c) +{ +} + +int __kmem_cache_shrink(struct kmem_cache *d) +{ + return 0; +} + +static struct kmem_cache kmem_cache_boot = { + .name = "kmem_cache", + .size = sizeof(struct kmem_cache), + .flags = SLAB_PANIC, + .align = ARCH_KMALLOC_MINALIGN, +}; + +void __init kmem_cache_init(void) +{ + 
kmem_cache = &kmem_cache_boot; + slab_state = UP; +} + +void __init kmem_cache_init_late(void) +{ + slab_state = FULL; +} diff --git a/mm/slub.c b/mm/slub.c new file mode 100644 index 000000000..157527d71 --- /dev/null +++ b/mm/slub.c @@ -0,0 +1,6310 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * SLUB: A slab allocator that limits cache line use instead of queuing + * objects in per cpu and per node lists. + * + * The allocator synchronizes using per slab locks or atomic operations + * and only uses a centralized lock to manage a pool of partial slabs. + * + * (C) 2007 SGI, Christoph Lameter + * (C) 2011 Linux Foundation, Christoph Lameter + */ + +#include +#include /* struct reclaim_state */ +#include +#include +#include +#include +#include +#include +#include "slab.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "internal.h" + +/* + * Lock order: + * 1. slab_mutex (Global Mutex) + * 2. node->list_lock (Spinlock) + * 3. kmem_cache->cpu_slab->lock (Local lock) + * 4. slab_lock(slab) (Only on some arches) + * 5. object_map_lock (Only for debugging) + * + * slab_mutex + * + * The role of the slab_mutex is to protect the list of all the slabs + * and to synchronize major metadata changes to slab cache structures. + * Also synchronizes memory hotplug callbacks. + * + * slab_lock + * + * The slab_lock is a wrapper around the page lock, thus it is a bit + * spinlock. + * + * The slab_lock is only used on arches that do not have the ability + * to do a cmpxchg_double. It only protects: + * + * A. slab->freelist -> List of free objects in a slab + * B. slab->inuse -> Number of objects in use + * C. slab->objects -> Number of objects in slab + * D. slab->frozen -> frozen state + * + * Frozen slabs + * + * If a slab is frozen then it is exempt from list management. 
It is not + * on any list except per cpu partial list. The processor that froze the + * slab is the one who can perform list operations on the slab. Other + * processors may put objects onto the freelist but the processor that + * froze the slab is the only one that can retrieve the objects from the + * slab's freelist. + * + * list_lock + * + * The list_lock protects the partial and full list on each node and + * the partial slab counter. If taken then no new slabs may be added or + * removed from the lists nor may the number of partial slabs be modified. + * (Note that the total number of slabs is an atomic value that may be + * modified without taking the list lock). + * + * The list_lock is a centralized lock and thus we avoid taking it as + * much as possible. As long as SLUB does not have to handle partial + * slabs, operations can continue without any centralized lock. F.e. + * allocating a long series of objects that fill up slabs does not require + * the list lock. + * + * For debug caches, all allocations are forced to go through a list_lock + * protected region to serialize against concurrent validation. + * + * cpu_slab->lock local lock + * + * This lock protects slowpath manipulation of all kmem_cache_cpu fields + * except the stat counters. This is a percpu structure manipulated only by + * the local cpu, so the lock protects against being preempted or interrupted + * by an irq. Fast path operations rely on lockless operations instead. + * + * On PREEMPT_RT, the local lock neither disables interrupts nor preemption + * which means the lockless fastpath cannot be used as it might interfere with + * in-progress slow path operations. In this case the local lock is always + * taken but it still utilizes the freelist for the common operations.
+ * + * lockless fastpaths + * + * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) + * are fully lockless when satisfied from the percpu slab (and when + * cmpxchg_double is possible to use, otherwise slab_lock is taken). + * They also don't disable preemption or migration or irqs. They rely on + * the transaction id (tid) field to detect being preempted or moved to + * another cpu. + * + * irq, preemption, migration considerations + * + * Interrupts are disabled as part of list_lock or local_lock operations, or + * around the slab_lock operation, in order to make the slab allocator safe + * to use in the context of an irq. + * + * In addition, preemption (or migration on PREEMPT_RT) is disabled in the + * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the + * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer + * doesn't have to be revalidated in each section protected by the local lock. + * + * SLUB assigns one slab for allocation to each processor. + * Allocations only occur from these slabs called cpu slabs. + * + * Slabs with free elements are kept on a partial list and during regular + * operations no list for full slabs is used. If an object in a full slab is + * freed then the slab will show up again on the partial lists. + * We track full slabs for debugging purposes though because otherwise we + * cannot scan all objects. + * + * Slabs are freed when they become empty. Teardown and setup is + * minimal so we rely on the page allocators per cpu caches for + * fast frees and allocs. + * + * slab->frozen The slab is frozen and exempt from list processing. + * This means that the slab is dedicated to a purpose + * such as satisfying allocations for a specific + * processor. Objects may be freed in the slab while + * it is frozen but slab_free will then skip the usual + * list operations. 
It is up to the processor holding
+ *			the slab to integrate the slab into the slab lists
+ *			when the slab is no longer needed.
+ *
+ *			One use of this flag is to mark slabs that are
+ *			used for allocations. Then such a slab becomes a cpu
+ *			slab. The cpu slab may be equipped with an additional
+ *			freelist that allows lockless access to
+ *			free objects in addition to the regular freelist
+ *			that requires the slab lock.
+ *
+ * SLAB_DEBUG_FLAGS	Slab requires special handling due to debug
+ *			options set. This moves	slab handling out of
+ *			the fast path and disables lockless freelists.
+ */
+
+/*
+ * We could simply use migrate_disable()/enable() but as long as it's a
+ * function call even on !PREEMPT_RT, use inline preempt_disable() there.
+ *
+ * Without CONFIG_PREEMPT_RT, slub_get_cpu_ptr()/slub_put_cpu_ptr() are plain
+ * aliases for get_cpu_ptr()/put_cpu_ptr() and USE_LOCKLESS_FAST_PATH() is
+ * true.  With CONFIG_PREEMPT_RT they expand to migrate_disable() followed by
+ * this_cpu_ptr(), respectively migrate_enable(), and
+ * USE_LOCKLESS_FAST_PATH() is false.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define slub_get_cpu_ptr(var)	get_cpu_ptr(var)
+#define slub_put_cpu_ptr(var)	put_cpu_ptr(var)
+#define USE_LOCKLESS_FAST_PATH()	(true)
+#else
+#define slub_get_cpu_ptr(var)		\
+({					\
+	migrate_disable();		\
+	this_cpu_ptr(var);		\
+})
+#define slub_put_cpu_ptr(var)		\
+do {					\
+	(void)(var);			\
+	migrate_enable();		\
+} while (0)
+#define USE_LOCKLESS_FAST_PATH()	(false)
+#endif
+
+#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SLUB_DEBUG_ON
+DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);	/* key defaults to true */
+#else
+DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);	/* key defaults to false */
+#endif
+#endif /* CONFIG_SLUB_DEBUG */
+
+/* Structure holding parameters for get_partial() call chain */
+struct partial_context {
+	struct slab **slab;
+	gfp_t flags;
+	unsigned int orig_size;
+};
+
+static inline bool kmem_cache_debug(struct kmem_cache *s)
+{
+	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);	/* any debug flag set on @s? */
+}
+
+static inline bool slub_debug_orig_size(struct kmem_cache *s)
+{
+	/* Only caches with both SLAB_STORE_USER and SLAB_KMALLOC qualify. */
+	return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
+			(s->flags & SLAB_KMALLOC));
+}
+
+void *fixup_red_left(struct kmem_cache *s, void *p)
+{
+	if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
+		p += s->red_left_pad;	/* skip over the left red zone */
+
+	return p;
+}
+
+static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
+{
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+	return !kmem_cache_debug(s);	/* debug caches never use cpu partial lists */
+#else
+	return false;
+#endif
+}
+
+/*
+ * Issues still to be resolved:
+ *
+ * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
+ *
+ * - Variable sizing of the per node arrays
+ */
+
+/* Enable to log cmpxchg failures */
+#undef SLUB_DEBUG_CMPXCHG
+
+/*
+ * Minimum number of partial slabs. These will be left on the partial
+ * lists even if they are empty. kmem_cache_shrink may reclaim them.
+ */
+#define MIN_PARTIAL 5
+
+/*
+ * Maximum number of desirable partial slabs.
+ * The existence of more partial slabs makes kmem_cache_shrink
+ * sort the partial list by the number of objects in use.
+ */
+#define MAX_PARTIAL 10
+
+#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
+				SLAB_POISON | SLAB_STORE_USER)
+
+/*
+ * These debug flags cannot use CMPXCHG because there might be consistency
+ * issues when checking or reading debug information
+ */
+#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
+				SLAB_TRACE)
+
+
+/*
+ * Debugging flags that require metadata to be stored in the slab.  These get
+ * disabled when slub_debug=O is used and a cache's min order increases with
+ * metadata.
+ */
+#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
+
+#define OO_SHIFT	16	/* the order is stored above this bit */
+#define OO_MASK		((1 << OO_SHIFT) - 1)	/* low bits: number of objects */
+#define MAX_OBJS_PER_PAGE	32767 /* since slab.objects is u15 */
+
+/* Internal SLUB flags */
+/* Poison object */
+#define __OBJECT_POISON		((slab_flags_t __force)0x80000000U)
+/* Use cmpxchg_double */
+#define __CMPXCHG_DOUBLE	((slab_flags_t __force)0x40000000U)
+
+/*
+ * Tracking user of a slab.
+ */
+#define TRACK_ADDRS_COUNT 16	/* size of the entries[] buffer in set_track_prepare() */
+struct track {
+	unsigned long addr;	/* Called from address */
+#ifdef CONFIG_STACKDEPOT
+	depot_stack_handle_t handle;	/* as stored by set_track_update() */
+#endif
+	int cpu;		/* Was running on cpu */
+	int pid;		/* Pid context */
+	unsigned long when;	/* When did the operation occur */
+};
+
+enum track_item { TRACK_ALLOC, TRACK_FREE };	/* index of the track record within an object */
+
+#ifdef CONFIG_SYSFS
+static int sysfs_slab_add(struct kmem_cache *);
+static int sysfs_slab_alias(struct kmem_cache *, const char *);
+#else
+static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }	/* no-op without sysfs */
+static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
+							{ return 0; }
+#endif
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
+static void debugfs_slab_add(struct kmem_cache *);
+#else
+static inline void debugfs_slab_add(struct kmem_cache *s) { }	/* no-op stub */
+#endif
+
+static inline void stat(const struct kmem_cache *s, enum stat_item si)
+{
+#ifdef CONFIG_SLUB_STATS
+	/*
+	 * The rmw is racy on a preemptible kernel but this is acceptable, so
+	 * avoid this_cpu_add()'s irq-disable overhead.
+	 *
+	 * Without CONFIG_SLUB_STATS this function compiles to nothing.
+	 */
+	raw_cpu_inc(s->cpu_slab->stat[si]);
+#endif
+}
+
+/*
+ * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
+ * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
+ * differ during memory hotplug/hotremove operations.
+ * Protected by slab_mutex.
+ */
+static nodemask_t slab_nodes;
+
+/*
+ * Workqueue used for flush_cpu_slab().
+ */
+static struct workqueue_struct *flushwq;
+
+/********************************************************************
+ * 			Core slab cache functions
+ *******************************************************************/
+
+/*
+ * Returns freelist pointer (ptr). With hardening, this is obfuscated
+ * with an XOR of the address where the pointer is held and a per-cache
+ * random number.
+ */ +static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, + unsigned long ptr_addr) +{ +#ifdef CONFIG_SLAB_FREELIST_HARDENED + /* + * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged. + * Normally, this doesn't cause any issues, as both set_freepointer() + * and get_freepointer() are called with a pointer with the same tag. + * However, there are some issues with CONFIG_SLUB_DEBUG code. For + * example, when __free_slub() iterates over objects in a cache, it + * passes untagged pointers to check_object(). check_object() in turns + * calls get_freepointer() with an untagged pointer, which causes the + * freepointer to be restored incorrectly. + */ + return (void *)((unsigned long)ptr ^ s->random ^ + swab((unsigned long)kasan_reset_tag((void *)ptr_addr))); +#else + return ptr; +#endif +} + +/* Returns the freelist pointer recorded at location ptr_addr. */ +static inline void *freelist_dereference(const struct kmem_cache *s, + void *ptr_addr) +{ + return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr), + (unsigned long)ptr_addr); +} + +static inline void *get_freepointer(struct kmem_cache *s, void *object) +{ + object = kasan_reset_tag(object); + return freelist_dereference(s, object + s->offset); +} + +static void prefetch_freepointer(const struct kmem_cache *s, void *object) +{ + prefetchw(object + s->offset); +} + +/* + * When running under KMSAN, get_freepointer_safe() may return an uninitialized + * pointer value in the case the current thread loses the race for the next + * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in + * slab_alloc_node() will fail, so the uninitialized value won't be used, but + * KMSAN will still check all arguments of cmpxchg because of imperfect + * handling of inline assembly. + * To work around this problem, we apply __no_kmsan_checks to ensure that + * get_freepointer_safe() returns initialized memory. 
+ */ +__no_kmsan_checks +static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) +{ + unsigned long freepointer_addr; + void *p; + + if (!debug_pagealloc_enabled_static()) + return get_freepointer(s, object); + + object = kasan_reset_tag(object); + freepointer_addr = (unsigned long)object + s->offset; + copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p)); + return freelist_ptr(s, p, freepointer_addr); +} + +static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) +{ + unsigned long freeptr_addr = (unsigned long)object + s->offset; + +#ifdef CONFIG_SLAB_FREELIST_HARDENED + BUG_ON(object == fp); /* naive detection of double free or corruption */ +#endif + + freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr); + *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); +} + +/* Loop over all objects in a slab */ +#define for_each_object(__p, __s, __addr, __objects) \ + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) + +static inline unsigned int order_objects(unsigned int order, unsigned int size) +{ + return ((unsigned int)PAGE_SIZE << order) / size; +} + +static inline struct kmem_cache_order_objects oo_make(unsigned int order, + unsigned int size) +{ + struct kmem_cache_order_objects x = { + (order << OO_SHIFT) + order_objects(order, size) + }; + + return x; +} + +static inline unsigned int oo_order(struct kmem_cache_order_objects x) +{ + return x.x >> OO_SHIFT; +} + +static inline unsigned int oo_objects(struct kmem_cache_order_objects x) +{ + return x.x & OO_MASK; +} + +#ifdef CONFIG_SLUB_CPU_PARTIAL +static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) +{ + unsigned int nr_slabs; + + s->cpu_partial = nr_objects; + + /* + * We take the number of objects but actually limit the number of + * slabs on the per cpu partial list, in order to limit excessive + * growth of the list. 
For simplicity we assume that the slabs will + * be half-full. + */ + nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); + s->cpu_partial_slabs = nr_slabs; +} +#else +static inline void +slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) +{ +} +#endif /* CONFIG_SLUB_CPU_PARTIAL */ + +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct slab *slab) +{ + struct page *page = slab_page(slab); + + VM_BUG_ON_PAGE(PageTail(page), page); + bit_spin_lock(PG_locked, &page->flags); +} + +static __always_inline void slab_unlock(struct slab *slab) +{ + struct page *page = slab_page(slab); + + VM_BUG_ON_PAGE(PageTail(page), page); + __bit_spin_unlock(PG_locked, &page->flags); +} + +/* + * Interrupts must be disabled (for the fallback code to work right), typically + * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is + * part of bit_spin_lock(), is sufficient because the policy is not to allow any + * allocation/ free operation in hardirq context. Therefore nothing can + * interrupt the operation. 
+ */
+static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+		void *freelist_old, unsigned long counters_old,
+		void *freelist_new, unsigned long counters_new,
+		const char *n)
+{
+	if (USE_LOCKLESS_FAST_PATH())
+		lockdep_assert_irqs_disabled();	/* the caller must have irqs off */
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+	if (s->flags & __CMPXCHG_DOUBLE) {
+		if (cmpxchg_double(&slab->freelist, &slab->counters,
+				   freelist_old, counters_old,
+				   freelist_new, counters_new))
+			return true;
+	} else
+#endif
+	{
+		slab_lock(slab);	/* fallback: compare and swap under the slab lock */
+		if (slab->freelist == freelist_old &&
+					slab->counters == counters_old) {
+			slab->freelist = freelist_new;
+			slab->counters = counters_new;
+			slab_unlock(slab);
+			return true;
+		}
+		slab_unlock(slab);
+	}
+
+	cpu_relax();
+	stat(s, CMPXCHG_DOUBLE_FAIL);	/* account the failed exchange */
+
+#ifdef SLUB_DEBUG_CMPXCHG
+	pr_info("%s %s: cmpxchg double redo ", n, s->name);
+#endif
+
+	return false;
+}
+/*
+ * Same exchange of slab->freelist and slab->counters as
+ * __cmpxchg_double_slab(), but the slab_lock() fallback saves and disables
+ * interrupts itself via local_irq_save()/local_irq_restore().
+ */
+static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
+		void *freelist_old, unsigned long counters_old,
+		void *freelist_new, unsigned long counters_new,
+		const char *n)
+{
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
+    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
+	if (s->flags & __CMPXCHG_DOUBLE) {
+		if (cmpxchg_double(&slab->freelist, &slab->counters,
+				   freelist_old, counters_old,
+				   freelist_new, counters_new))
+			return true;
+	} else
+#endif
+	{
+		unsigned long flags;
+
+		local_irq_save(flags);
+		slab_lock(slab);
+		if (slab->freelist == freelist_old &&
+					slab->counters == counters_old) {
+			slab->freelist = freelist_new;
+			slab->counters = counters_new;
+			slab_unlock(slab);
+			local_irq_restore(flags);
+			return true;
+		}
+		slab_unlock(slab);
+		local_irq_restore(flags);
+	}
+
+	cpu_relax();
+	stat(s, CMPXCHG_DOUBLE_FAIL);	/* account the failed exchange */
+
+#ifdef SLUB_DEBUG_CMPXCHG
+	pr_info("%s %s: cmpxchg double redo ", n, s->name);
+#endif
+
+	return false;
+}
+
+#ifdef CONFIG_SLUB_DEBUG
+static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];	/* one bit per object */
+static DEFINE_SPINLOCK(object_map_lock);	/* NOTE(review): presumably guards object_map - confirm at its users */
+/*
+ * Clear the first slab->objects bits of @obj_map, then set the bit of every
+ * object found on slab->freelist.
+ */
+static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
+		       struct slab *slab)
+{
+	void *addr = slab_address(slab);
+	void *p;
+
+	bitmap_zero(obj_map, slab->objects);
+
+	for (p = slab->freelist; p; p = get_freepointer(s, p))
+		set_bit(__obj_to_index(s, addr, p), obj_map);
+}
+
+#if IS_ENABLED(CONFIG_KUNIT)
+static bool slab_add_kunit_errors(void)
+{
+	struct kunit_resource *resource;
+
+	if (likely(!current->kunit_test))
+		return false;	/* not running inside a KUnit test */
+
+	resource = kunit_find_named_resource(current->kunit_test, "slab_errors");
+	if (!resource)
+		return false;
+
+	(*(int *)resource->data)++;	/* bump the counter behind the resource */
+	kunit_put_resource(resource);
+	return true;
+}
+#else
+static inline bool slab_add_kunit_errors(void) { return false; }
+#endif
+
+static inline unsigned int size_from_object(struct kmem_cache *s)
+{
+	if (s->flags & SLAB_RED_ZONE)
+		return s->size - s->red_left_pad;	/* exclude the left red zone */
+
+	return s->size;
+}
+
+static inline void *restore_red_left(struct kmem_cache *s, void *p)
+{
+	if (s->flags & SLAB_RED_ZONE)
+		p -= s->red_left_pad;	/* step back over the left red zone */
+
+	return p;
+}
+
+/*
+ * Debug settings:
+ */
+#if defined(CONFIG_SLUB_DEBUG_ON)
+static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
+#else
+static slab_flags_t slub_debug;
+#endif
+
+static char *slub_debug_string;
+static int disable_higher_order_debug;
+
+/*
+ * slub is about to manipulate internal object metadata.  This memory lies
+ * outside the range of the allocated object, so accessing it would normally
+ * be reported by kasan as a bounds error.  metadata_access_enable() is used
+ * to tell kasan that these accesses are OK.
+ */ +static inline void metadata_access_enable(void) +{ + kasan_disable_current(); +} + +static inline void metadata_access_disable(void) +{ + kasan_enable_current(); +} + +/* + * Object debugging + */ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct slab *slab, void *object) +{ + void *base; + + if (!object) + return 1; + + base = slab_address(slab); + object = kasan_reset_tag(object); + object = restore_red_left(s, object); + if (object < base || object >= base + slab->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + +static void print_section(char *level, char *text, u8 *addr, + unsigned int length) +{ + metadata_access_enable(); + print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, + 16, 1, kasan_reset_tag((void *)addr), length, 1); + metadata_access_disable(); +} + +/* + * See comment in calculate_sizes(). + */ +static inline bool freeptr_outside_object(struct kmem_cache *s) +{ + return s->offset >= s->inuse; +} + +/* + * Return offset of the end of info block which is inuse + free pointer if + * not overlapping with object. 
+ */
+static inline unsigned int get_info_end(struct kmem_cache *s)
+{
+	if (freeptr_outside_object(s))
+		return s->inuse + sizeof(void *);
+	else
+		return s->inuse;
+}
+
+/*
+ * Return the track record of @object selected by @alloc.  The records start
+ * at get_info_end() bytes into the object and are indexed by the
+ * track_item value.
+ */
+static struct track *get_track(struct kmem_cache *s, void *object,
+	enum track_item alloc)
+{
+	struct track *p;
+
+	p = object + get_info_end(s);
+
+	return kasan_reset_tag(p + alloc);
+}
+
+#ifdef CONFIG_STACKDEPOT
+/*
+ * Save up to TRACK_ADDRS_COUNT entries of the current stack trace (skipping
+ * the three innermost frames) in the stack depot and return the handle.
+ */
+static noinline depot_stack_handle_t set_track_prepare(void)
+{
+	depot_stack_handle_t handle;
+	unsigned long entries[TRACK_ADDRS_COUNT];
+	unsigned int nr_entries;
+
+	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
+	handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
+
+	return handle;
+}
+#else
+static inline depot_stack_handle_t set_track_prepare(void)
+{
+	return 0;
+}
+#endif
+
+/* Fill the selected track record with caller, cpu, pid and timestamp. */
+static void set_track_update(struct kmem_cache *s, void *object,
+			     enum track_item alloc, unsigned long addr,
+			     depot_stack_handle_t handle)
+{
+	struct track *p = get_track(s, object, alloc);
+
+#ifdef CONFIG_STACKDEPOT
+	p->handle = handle;
+#endif
+	p->addr = addr;
+	p->cpu = smp_processor_id();
+	p->pid = current->pid;
+	p->when = jiffies;
+}
+
+/* Capture the current stack and record it together with @addr. */
+static __always_inline void set_track(struct kmem_cache *s, void *object,
+				      enum track_item alloc, unsigned long addr)
+{
+	depot_stack_handle_t handle = set_track_prepare();
+
+	set_track_update(s, object, alloc, addr, handle);
+}
+
+/* Zero both track records of @object; no-op without SLAB_STORE_USER. */
+static void init_tracking(struct kmem_cache *s, void *object)
+{
+	struct track *p;
+
+	if (!(s->flags & SLAB_STORE_USER))
+		return;
+
+	p = get_track(s, object, TRACK_ALLOC);
+	memset(p, 0, 2*sizeof(struct track));
+}
+
+/* Print one track record; records without a caller address are skipped. */
+static void print_track(const char *s, struct track *t, unsigned long pr_time)
+{
+	depot_stack_handle_t handle __maybe_unused;
+
+	if (!t->addr)
+		return;
+
+	pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
+	       s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
+#ifdef CONFIG_STACKDEPOT
+	handle = READ_ONCE(t->handle);
+	if (handle)
+		stack_depot_print(handle);
+	else
+		pr_err("object allocation/free stack trace missing\n");
+#endif
+}
+
+/* Print the allocation and free records of @object, if the cache has them. */
+void print_tracking(struct kmem_cache *s, void *object)
+{
+	unsigned long pr_time = jiffies;
+	if (!(s->flags & SLAB_STORE_USER))
+		return;
+
+	print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
+	print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
+}
+
+static void print_slab_info(const struct slab *slab)
+{
+	struct folio *folio = (struct folio *)slab_folio(slab);
+
+	pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
+	       slab, slab->objects, slab->inuse, slab->freelist,
+	       folio_flags(folio, 0));
+}
+
+/*
+ * kmalloc caches have fixed sizes (mostly power of 2), and kmalloc() API
+ * family will round up the real request size to these fixed ones, so
+ * there could be an extra area than what is requested. Save the original
+ * request size in the meta data area, for better debug and sanity check.
+ */
+static inline void set_orig_size(struct kmem_cache *s,
+				void *object, unsigned int orig_size)
+{
+	void *p = kasan_reset_tag(object);
+
+	if (!slub_debug_orig_size(s))
+		return;
+
+	/* The size is stored right behind the two track records. */
+	p += get_info_end(s);
+	p += sizeof(struct track) * 2;
+
+	*(unsigned int *)p = orig_size;
+}
+
+/*
+ * Return the request size saved by set_orig_size(), or s->object_size for
+ * caches that do not store it.
+ */
+static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
+{
+	void *p = kasan_reset_tag(object);
+
+	if (!slub_debug_orig_size(s))
+		return s->object_size;
+
+	p += get_info_end(s);
+	p += sizeof(struct track) * 2;
+
+	return *(unsigned int *)p;
+}
+
+/*
+ * Print the "BUG <cache>" banner with a printf-style message.  Annotated
+ * with __printf() like slab_fix() and slab_err() so that the compiler checks
+ * the format string against its arguments.
+ */
+__printf(2, 3)
+static void slab_bug(struct kmem_cache *s, char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_err("=============================================================================\n");
+	pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf);
+	pr_err("-----------------------------------------------------------------------------\n\n");
+	va_end(args);
+}
+
+/*
+ * Print a "FIX <cache>" message.  Nothing is printed when
+ * slab_add_kunit_errors() reports that the error was accounted to a test.
+ */
+__printf(2, 3)
+static void slab_fix(struct kmem_cache *s, char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	if (slab_add_kunit_errors())
+		return;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	pr_err("FIX %s: %pV\n", s->name, &vaf);
+	va_end(args);
+}
+
+/*
+ * Dump everything known about object @p: tracking records, slab summary,
+ * red zones, object bytes and trailing padding, followed by a stack dump.
+ */
+static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
+{
+	unsigned int off;	/* Offset of last byte */
+	u8 *addr = slab_address(slab);
+
+	print_tracking(s, p);
+
+	print_slab_info(slab);
+
+	pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
+	       p, p - addr, get_freepointer(s, p));
+
+	if (s->flags & SLAB_RED_ZONE)
+		print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
+			      s->red_left_pad);
+	else if (p > addr + 16)
+		print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
+
+	print_section(KERN_ERR, "Object ", p,
+		      min_t(unsigned int, s->object_size, PAGE_SIZE));
+	if (s->flags & SLAB_RED_ZONE)
+		print_section(KERN_ERR, "Redzone ", p + s->object_size,
+			s->inuse - s->object_size);
+
+	/* Skip over every metadata area that follows the object. */
+	off = get_info_end(s);
+
+	if (s->flags & SLAB_STORE_USER)
+		off += 2 * sizeof(struct track);
+
+	if (slub_debug_orig_size(s))
+		off += sizeof(unsigned int);
+
+	off += kasan_metadata_size(s);
+
+	if (off != size_from_object(s))
+		/* Beginning of the filler is the free pointer */
+		print_section(KERN_ERR, "Padding ", p + off,
+			      size_from_object(s) - off);
+
+	dump_stack();
+}
+
+/* Report a problem with @object: banner, object dump, and kernel taint. */
+static void object_err(struct kmem_cache *s, struct slab *slab,
+			u8 *object, char *reason)
+{
+	if (slab_add_kunit_errors())
+		return;
+
+	slab_bug(s, "%s", reason);
+	print_trailer(s, slab, object);
+	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+}
+
+/*
+ * With SLAB_CONSISTENCY_CHECKS, detect a free pointer that does not point
+ * into @slab.  On detection the freelist is reported, cut off by setting
+ * *@freelist to NULL, and true is returned.
+ */
+static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
+			       void **freelist, void *nextfree)
+{
+	if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
+	    !check_valid_pointer(s, slab, nextfree) && freelist) {
+		object_err(s, slab, *freelist, "Freechain corrupt");
+		*freelist = NULL;
+		slab_fix(s, "Isolate corrupted freechain");
+		return true;
+	}
+
+	return false;
+}
+
+static __printf(3, 4) void slab_err(struct kmem_cache *s, struct
slab *slab, + const char *fmt, ...) +{ + va_list args; + char buf[100]; + + if (slab_add_kunit_errors()) + return; + + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + slab_bug(s, "%s", buf); + print_slab_info(slab); + dump_stack(); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); +} + +static void init_object(struct kmem_cache *s, void *object, u8 val) +{ + u8 *p = kasan_reset_tag(object); + + if (s->flags & SLAB_RED_ZONE) + memset(p - s->red_left_pad, val, s->red_left_pad); + + if (s->flags & __OBJECT_POISON) { + memset(p, POISON_FREE, s->object_size - 1); + p[s->object_size - 1] = POISON_END; + } + + if (s->flags & SLAB_RED_ZONE) + memset(p + s->object_size, val, s->inuse - s->object_size); +} + +static void restore_bytes(struct kmem_cache *s, char *message, u8 data, + void *from, void *to) +{ + slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data); + memset(from, data, to - from); +} + +static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, + u8 *object, char *what, + u8 *start, unsigned int value, unsigned int bytes) +{ + u8 *fault; + u8 *end; + u8 *addr = slab_address(slab); + + metadata_access_enable(); + fault = memchr_inv(kasan_reset_tag(start), value, bytes); + metadata_access_disable(); + if (!fault) + return 1; + + end = start + bytes; + while (end > fault && end[-1] == value) + end--; + + if (slab_add_kunit_errors()) + goto skip_bug_print; + + slab_bug(s, "%s overwritten", what); + pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + fault, end - 1, fault - addr, + fault[0], value); + print_trailer(s, slab, object); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + +skip_bug_print: + restore_bytes(s, what, value, fault, end); + return 0; +} + +/* + * Object layout: + * + * object address + * Bytes of the object to be managed. + * If the freepointer may overlay the object then the free + * pointer is at the middle of the object. 
+ * + * Poisoning uses 0x6b (POISON_FREE) and the last byte is + * 0xa5 (POISON_END) + * + * object + s->object_size + * Padding to reach word boundary. This is also used for Redzoning. + * Padding is extended by another word if Redzoning is enabled and + * object_size == inuse. + * + * We fill with 0xbb (RED_INACTIVE) for inactive objects and with + * 0xcc (RED_ACTIVE) for objects in use. + * + * object + s->inuse + * Meta data starts here. + * + * A. Free pointer (if we cannot overwrite object on free) + * B. Tracking data for SLAB_STORE_USER + * C. Original request size for kmalloc object (SLAB_STORE_USER enabled) + * D. Padding to reach required alignment boundary or at minimum + * one word if debugging is on to be able to detect writes + * before the word boundary. + * + * Padding is done using 0x5a (POISON_INUSE) + * + * object + s->size + * Nothing is used beyond s->size. + * + * If slabcaches are merged then the object_size and inuse boundaries are mostly + * ignored. And therefore no slab options that rely on these boundaries + * may be used with merged slabcaches. 
+ */ + +static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) +{ + unsigned long off = get_info_end(s); /* The end of info */ + + if (s->flags & SLAB_STORE_USER) { + /* We also have user information there */ + off += 2 * sizeof(struct track); + + if (s->flags & SLAB_KMALLOC) + off += sizeof(unsigned int); + } + + off += kasan_metadata_size(s); + + if (size_from_object(s) == off) + return 1; + + return check_bytes_and_report(s, slab, p, "Object padding", + p + off, POISON_INUSE, size_from_object(s) - off); +} + +/* Check the pad bytes at the end of a slab page */ +static void slab_pad_check(struct kmem_cache *s, struct slab *slab) +{ + u8 *start; + u8 *fault; + u8 *end; + u8 *pad; + int length; + int remainder; + + if (!(s->flags & SLAB_POISON)) + return; + + start = slab_address(slab); + length = slab_size(slab); + end = start + length; + remainder = length % s->size; + if (!remainder) + return; + + pad = end - remainder; + metadata_access_enable(); + fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder); + metadata_access_disable(); + if (!fault) + return; + while (end > fault && end[-1] == POISON_INUSE) + end--; + + slab_err(s, slab, "Padding overwritten. 
0x%p-0x%p @offset=%tu", + fault, end - 1, fault - start); + print_section(KERN_ERR, "Padding ", pad, remainder); + + restore_bytes(s, "slab padding", POISON_INUSE, fault, end); +} + +static int check_object(struct kmem_cache *s, struct slab *slab, + void *object, u8 val) +{ + u8 *p = object; + u8 *endobject = object + s->object_size; + + if (s->flags & SLAB_RED_ZONE) { + if (!check_bytes_and_report(s, slab, object, "Left Redzone", + object - s->red_left_pad, val, s->red_left_pad)) + return 0; + + if (!check_bytes_and_report(s, slab, object, "Right Redzone", + endobject, val, s->inuse - s->object_size)) + return 0; + } else { + if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { + check_bytes_and_report(s, slab, p, "Alignment padding", + endobject, POISON_INUSE, + s->inuse - s->object_size); + } + } + + if (s->flags & SLAB_POISON) { + if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && + (!check_bytes_and_report(s, slab, p, "Poison", p, + POISON_FREE, s->object_size - 1) || + !check_bytes_and_report(s, slab, p, "End Poison", + p + s->object_size - 1, POISON_END, 1))) + return 0; + /* + * check_pad_bytes cleans up on its own. + */ + check_pad_bytes(s, slab, p); + } + + if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE) + /* + * Object and freepointer overlap. Cannot check + * freepointer while object is allocated. + */ + return 1; + + /* Check free pointer validity */ + if (!check_valid_pointer(s, slab, get_freepointer(s, p))) { + object_err(s, slab, p, "Freepointer corrupt"); + /* + * No choice but to zap it and thus lose the remainder + * of the free objects in this slab. May cause + * another error because the object count is now wrong. 
+ */ + set_freepointer(s, p, NULL); + return 0; + } + return 1; +} + +static int check_slab(struct kmem_cache *s, struct slab *slab) +{ + int maxobj; + + if (!folio_test_slab(slab_folio(slab))) { + slab_err(s, slab, "Not a valid slab page"); + return 0; + } + + maxobj = order_objects(slab_order(slab), s->size); + if (slab->objects > maxobj) { + slab_err(s, slab, "objects %u > max %u", + slab->objects, maxobj); + return 0; + } + if (slab->inuse > slab->objects) { + slab_err(s, slab, "inuse %u > max %u", + slab->inuse, slab->objects); + return 0; + } + /* Slab_pad_check fixes things up after itself */ + slab_pad_check(s, slab); + return 1; +} + +/* + * Determine if a certain object in a slab is on the freelist. Must hold the + * slab lock to guarantee that the chains are in a consistent state. + */ +static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) +{ + int nr = 0; + void *fp; + void *object = NULL; + int max_objects; + + fp = slab->freelist; + while (fp && nr <= slab->objects) { + if (fp == search) + return 1; + if (!check_valid_pointer(s, slab, fp)) { + if (object) { + object_err(s, slab, object, + "Freechain corrupt"); + set_freepointer(s, object, NULL); + } else { + slab_err(s, slab, "Freepointer corrupt"); + slab->freelist = NULL; + slab->inuse = slab->objects; + slab_fix(s, "Freelist cleared"); + return 0; + } + break; + } + object = fp; + fp = get_freepointer(s, object); + nr++; + } + + max_objects = order_objects(slab_order(slab), s->size); + if (max_objects > MAX_OBJS_PER_PAGE) + max_objects = MAX_OBJS_PER_PAGE; + + if (slab->objects != max_objects) { + slab_err(s, slab, "Wrong number of objects. Found %d but should be %d", + slab->objects, max_objects); + slab->objects = max_objects; + slab_fix(s, "Number of objects adjusted"); + } + if (slab->inuse != slab->objects - nr) { + slab_err(s, slab, "Wrong object count. 
Counter is %d but counted were %d", + slab->inuse, slab->objects - nr); + slab->inuse = slab->objects - nr; + slab_fix(s, "Object count adjusted"); + } + return search == NULL; +} + +static void trace(struct kmem_cache *s, struct slab *slab, void *object, + int alloc) +{ + if (s->flags & SLAB_TRACE) { + pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + s->name, + alloc ? "alloc" : "free", + object, slab->inuse, + slab->freelist); + + if (!alloc) + print_section(KERN_INFO, "Object ", (void *)object, + s->object_size); + + dump_stack(); + } +} + +/* + * Tracking of fully allocated slabs for debugging purposes. + */ +static void add_full(struct kmem_cache *s, + struct kmem_cache_node *n, struct slab *slab) +{ + if (!(s->flags & SLAB_STORE_USER)) + return; + + lockdep_assert_held(&n->list_lock); + list_add(&slab->slab_list, &n->full); +} + +static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) +{ + if (!(s->flags & SLAB_STORE_USER)) + return; + + lockdep_assert_held(&n->list_lock); + list_del(&slab->slab_list); +} + +/* Tracking of the number of slabs for debugging purposes */ +static inline unsigned long slabs_node(struct kmem_cache *s, int node) +{ + struct kmem_cache_node *n = get_node(s, node); + + return atomic_long_read(&n->nr_slabs); +} + +static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->nr_slabs); +} + +static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + /* + * May be called early in order to allocate a slab for the + * kmem_cache_node structure. Solve the chicken-egg + * dilemma by deferring the increment of the count during + * bootstrap (see early_kmem_cache_node_alloc). 
+ */ + if (likely(n)) { + atomic_long_inc(&n->nr_slabs); + atomic_long_add(objects, &n->total_objects); + } +} +static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) +{ + struct kmem_cache_node *n = get_node(s, node); + + atomic_long_dec(&n->nr_slabs); + atomic_long_sub(objects, &n->total_objects); +} + +/* Object debug checks for alloc/free paths */ +static void setup_object_debug(struct kmem_cache *s, void *object) +{ + if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)) + return; + + init_object(s, object, SLUB_RED_INACTIVE); + init_tracking(s, object); +} + +static +void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) +{ + if (!kmem_cache_debug_flags(s, SLAB_POISON)) + return; + + metadata_access_enable(); + memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab)); + metadata_access_disable(); +} + +static inline int alloc_consistency_checks(struct kmem_cache *s, + struct slab *slab, void *object) +{ + if (!check_slab(s, slab)) + return 0; + + if (!check_valid_pointer(s, slab, object)) { + object_err(s, slab, object, "Freelist Pointer check fails"); + return 0; + } + + if (!check_object(s, slab, object, SLUB_RED_INACTIVE)) + return 0; + + return 1; +} + +static noinline int alloc_debug_processing(struct kmem_cache *s, + struct slab *slab, void *object, int orig_size) +{ + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!alloc_consistency_checks(s, slab, object)) + goto bad; + } + + /* Success. Perform special debug activities for allocs */ + trace(s, slab, object, 1); + set_orig_size(s, object, orig_size); + init_object(s, object, SLUB_RED_ACTIVE); + return 1; + +bad: + if (folio_test_slab(slab_folio(slab))) { + /* + * If this is a slab page then lets do the best we can + * to avoid issues in the future. Marking all objects + * as used avoids touching the remaining objects. 
+ */ + slab_fix(s, "Marking all objects used"); + slab->inuse = slab->objects; + slab->freelist = NULL; + } + return 0; +} + +static inline int free_consistency_checks(struct kmem_cache *s, + struct slab *slab, void *object, unsigned long addr) +{ + if (!check_valid_pointer(s, slab, object)) { + slab_err(s, slab, "Invalid object pointer 0x%p", object); + return 0; + } + + if (on_freelist(s, slab, object)) { + object_err(s, slab, object, "Object already free"); + return 0; + } + + if (!check_object(s, slab, object, SLUB_RED_ACTIVE)) + return 0; + + if (unlikely(s != slab->slab_cache)) { + if (!folio_test_slab(slab_folio(slab))) { + slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", + object); + } else if (!slab->slab_cache) { + pr_err("SLUB : no slab for object 0x%p.\n", + object); + dump_stack(); + } else + object_err(s, slab, object, + "page slab pointer corrupt."); + return 0; + } + return 1; +} + +/* + * Parse a block of slub_debug options. Blocks are delimited by ';' + * + * @str: start of block + * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified + * @slabs: return start of list of slabs, or NULL when there's no list + * @init: assume this is initial parsing and not per-kmem-create parsing + * + * returns the start of next block if there's any, or NULL + */ +static char * +parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) +{ + bool higher_order_disable = false; + + /* Skip any completely empty blocks */ + while (*str && *str == ';') + str++; + + if (*str == ',') { + /* + * No options but restriction on slabs. This means full + * debugging for slabs matching a pattern. 
+ */ + *flags = DEBUG_DEFAULT_FLAGS; + goto check_slabs; + } + *flags = 0; + + /* Determine which debug features should be switched on */ + for (; *str && *str != ',' && *str != ';'; str++) { + switch (tolower(*str)) { + case '-': + *flags = 0; + break; + case 'f': + *flags |= SLAB_CONSISTENCY_CHECKS; + break; + case 'z': + *flags |= SLAB_RED_ZONE; + break; + case 'p': + *flags |= SLAB_POISON; + break; + case 'u': + *flags |= SLAB_STORE_USER; + break; + case 't': + *flags |= SLAB_TRACE; + break; + case 'a': + *flags |= SLAB_FAILSLAB; + break; + case 'o': + /* + * Avoid enabling debugging on caches if its minimum + * order would increase as a result. + */ + higher_order_disable = true; + break; + default: + if (init) + pr_err("slub_debug option '%c' unknown. skipped\n", *str); + } + } +check_slabs: + if (*str == ',') + *slabs = ++str; + else + *slabs = NULL; + + /* Skip over the slab list */ + while (*str && *str != ';') + str++; + + /* Skip any completely empty blocks */ + while (*str && *str == ';') + str++; + + if (init && higher_order_disable) + disable_higher_order_debug = 1; + + if (*str) + return str; + else + return NULL; +} + +static int __init setup_slub_debug(char *str) +{ + slab_flags_t flags; + slab_flags_t global_flags; + char *saved_str; + char *slab_list; + bool global_slub_debug_changed = false; + bool slab_list_specified = false; + + global_flags = DEBUG_DEFAULT_FLAGS; + if (*str++ != '=' || !*str) + /* + * No options specified. Switch on full debugging. 
+ */ + goto out; + + saved_str = str; + while (str) { + str = parse_slub_debug_flags(str, &flags, &slab_list, true); + + if (!slab_list) { + global_flags = flags; + global_slub_debug_changed = true; + } else { + slab_list_specified = true; + if (flags & SLAB_STORE_USER) + stack_depot_want_early_init(); + } + } + + /* + * For backwards compatibility, a single list of flags with list of + * slabs means debugging is only changed for those slabs, so the global + * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending + * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as + * long as there is no option specifying flags without a slab list. + */ + if (slab_list_specified) { + if (!global_slub_debug_changed) + global_flags = slub_debug; + slub_debug_string = saved_str; + } +out: + slub_debug = global_flags; + if (slub_debug & SLAB_STORE_USER) + stack_depot_want_early_init(); + if (slub_debug != 0 || slub_debug_string) + static_branch_enable(&slub_debug_enabled); + else + static_branch_disable(&slub_debug_enabled); + if ((static_branch_unlikely(&init_on_alloc) || + static_branch_unlikely(&init_on_free)) && + (slub_debug & SLAB_POISON)) + pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n"); + return 1; +} + +__setup("slub_debug", setup_slub_debug); + +/* + * kmem_cache_flags - apply debugging options to the cache + * @object_size: the size of an object without meta data + * @flags: flags to set + * @name: name of the cache + * + * Debug option(s) are applied to @flags. In addition to the debug + * option(s), if a slab name (or multiple) is specified i.e. + * slub_debug=,, ... + * then only the select slabs will receive the debug option(s). 
+ */ +slab_flags_t kmem_cache_flags(unsigned int object_size, + slab_flags_t flags, const char *name) +{ + char *iter; + size_t len; + char *next_block; + slab_flags_t block_flags; + slab_flags_t slub_debug_local = slub_debug; + + if (flags & SLAB_NO_USER_FLAGS) + return flags; + + /* + * If the slab cache is for debugging (e.g. kmemleak) then + * don't store user (stack trace) information by default, + * but let the user enable it via the command line below. + */ + if (flags & SLAB_NOLEAKTRACE) + slub_debug_local &= ~SLAB_STORE_USER; + + len = strlen(name); + next_block = slub_debug_string; + /* Go through all blocks of debug options, see if any matches our slab's name */ + while (next_block) { + next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false); + if (!iter) + continue; + /* Found a block that has a slab list, search it */ + while (*iter) { + char *end, *glob; + size_t cmplen; + + end = strchrnul(iter, ','); + if (next_block && next_block < end) + end = next_block - 1; + + glob = strnchr(iter, end - iter, '*'); + if (glob) + cmplen = glob - iter; + else + cmplen = max_t(size_t, len, (end - iter)); + + if (!strncmp(name, iter, cmplen)) { + flags |= block_flags; + return flags; + } + + if (!*end || *end == ';') + break; + iter = end + 1; + } + } + + return flags | slub_debug_local; +} +#else /* !CONFIG_SLUB_DEBUG */ +static inline void setup_object_debug(struct kmem_cache *s, void *object) {} +static inline +void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} + +static inline int alloc_debug_processing(struct kmem_cache *s, + struct slab *slab, void *object, int orig_size) { return 0; } + +static inline void free_debug_processing( + struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int bulk_cnt, + unsigned long addr) {} + +static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} +static inline int check_object(struct kmem_cache *s, struct slab *slab, + void *object, u8 val) { 
return 1; } +static inline void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr) {} +static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct slab *slab) {} +static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct slab *slab) {} +slab_flags_t kmem_cache_flags(unsigned int object_size, + slab_flags_t flags, const char *name) +{ + return flags; +} +#define slub_debug 0 + +#define disable_higher_order_debug 0 + +static inline unsigned long slabs_node(struct kmem_cache *s, int node) + { return 0; } +static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) + { return 0; } +static inline void inc_slabs_node(struct kmem_cache *s, int node, + int objects) {} +static inline void dec_slabs_node(struct kmem_cache *s, int node, + int objects) {} + +static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, + void **freelist, void *nextfree) +{ + return false; +} +#endif /* CONFIG_SLUB_DEBUG */ + +/* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + */ +static __always_inline bool slab_free_hook(struct kmem_cache *s, + void *x, bool init) +{ + kmemleak_free_recursive(x, s->flags); + kmsan_slab_free(s, x); + + debug_check_no_locks_freed(x, s->object_size); + + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->object_size); + + /* Use KCSAN to help debug racy use-after-free. */ + if (!(s->flags & SLAB_TYPESAFE_BY_RCU)) + __kcsan_check_access(x, s->object_size, + KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); + + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_free and initialization memset's must be + * kept together to avoid discrepancies in behavior. + * + * The initialization memset's clear the object and the metadata, + * but don't touch the SLAB redzone. 
+ */ + if (init) { + int rsize; + + if (!kasan_has_integrated_init()) + memset(kasan_reset_tag(x), 0, s->object_size); + rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; + memset((char *)kasan_reset_tag(x) + s->inuse, 0, + s->size - s->inuse - rsize); + } + /* KASAN might put x into memory quarantine, delaying its reuse. */ + return kasan_slab_free(s, x, init); +} + +static inline bool slab_free_freelist_hook(struct kmem_cache *s, + void **head, void **tail, + int *cnt) +{ + + void *object; + void *next = *head; + void *old_tail = *tail ? *tail : *head; + + if (is_kfence_address(next)) { + slab_free_hook(s, next, false); + return true; + } + + /* Head and tail of the reconstructed freelist */ + *head = NULL; + *tail = NULL; + + do { + object = next; + next = get_freepointer(s, object); + + /* If object's reuse doesn't have to be delayed */ + if (!slab_free_hook(s, object, slab_want_init_on_free(s))) { + /* Move object to the new freelist */ + set_freepointer(s, object, *head); + *head = object; + if (!*tail) + *tail = object; + } else { + /* + * Adjust the reconstructed freelist depth + * accordingly if object's reuse is delayed. 
+ */ + --(*cnt); + } + } while (object != old_tail); + + if (*head == *tail) + *tail = NULL; + + return *head != NULL; +} + +static void *setup_object(struct kmem_cache *s, void *object) +{ + setup_object_debug(s, object); + object = kasan_init_slab_obj(s, object); + if (unlikely(s->ctor)) { + kasan_unpoison_object_data(s, object); + s->ctor(object); + kasan_poison_object_data(s, object); + } + return object; +} + +/* + * Slab allocation and freeing + */ +static inline struct slab *alloc_slab_page(gfp_t flags, int node, + struct kmem_cache_order_objects oo) +{ + struct folio *folio; + struct slab *slab; + unsigned int order = oo_order(oo); + + if (node == NUMA_NO_NODE) + folio = (struct folio *)alloc_pages(flags, order); + else + folio = (struct folio *)__alloc_pages_node(node, flags, order); + + if (!folio) + return NULL; + + slab = folio_slab(folio); + __folio_set_slab(folio); + if (page_is_pfmemalloc(folio_page(folio, 0))) + slab_set_pfmemalloc(slab); + + return slab; +} + +#ifdef CONFIG_SLAB_FREELIST_RANDOM +/* Pre-initialize the random sequence cache */ +static int init_cache_random_seq(struct kmem_cache *s) +{ + unsigned int count = oo_objects(s->oo); + int err; + + /* Bailout if already initialised */ + if (s->random_seq) + return 0; + + err = cache_random_seq_create(s, count, GFP_KERNEL); + if (err) { + pr_err("SLUB: Unable to initialize free list for %s\n", + s->name); + return err; + } + + /* Transform to an offset on the set of pages */ + if (s->random_seq) { + unsigned int i; + + for (i = 0; i < count; i++) + s->random_seq[i] *= s->size; + } + return 0; +} + +/* Initialize each random sequence freelist per cache */ +static void __init init_freelist_randomization(void) +{ + struct kmem_cache *s; + + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) + init_cache_random_seq(s); + + mutex_unlock(&slab_mutex); +} + +/* Get the next entry on the pre-computed freelist randomized */ +static void *next_freelist_entry(struct kmem_cache *s, 
struct slab *slab, + unsigned long *pos, void *start, + unsigned long page_limit, + unsigned long freelist_count) +{ + unsigned int idx; + + /* + * If the target page allocation failed, the number of objects on the + * page might be smaller than the usual size defined by the cache. + */ + do { + idx = s->random_seq[*pos]; + *pos += 1; + if (*pos >= freelist_count) + *pos = 0; + } while (unlikely(idx >= page_limit)); + + return (char *)start + idx; +} + +/* Shuffle the single linked freelist based on a random pre-computed sequence */ +static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) +{ + void *start; + void *cur; + void *next; + unsigned long idx, pos, page_limit, freelist_count; + + if (slab->objects < 2 || !s->random_seq) + return false; + + freelist_count = oo_objects(s->oo); + pos = prandom_u32_max(freelist_count); + + page_limit = slab->objects * s->size; + start = fixup_red_left(s, slab_address(slab)); + + /* First entry is used as the base of the freelist */ + cur = next_freelist_entry(s, slab, &pos, start, page_limit, + freelist_count); + cur = setup_object(s, cur); + slab->freelist = cur; + + for (idx = 1; idx < slab->objects; idx++) { + next = next_freelist_entry(s, slab, &pos, start, page_limit, + freelist_count); + next = setup_object(s, next); + set_freepointer(s, cur, next); + cur = next; + } + set_freepointer(s, cur, NULL); + + return true; +} +#else +static inline int init_cache_random_seq(struct kmem_cache *s) +{ + return 0; +} +static inline void init_freelist_randomization(void) { } +static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) +{ + return false; +} +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + +static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + struct slab *slab; + struct kmem_cache_order_objects oo = s->oo; + gfp_t alloc_gfp; + void *start, *p, *next; + int idx; + bool shuffle; + + flags &= gfp_allowed_mask; + + flags |= s->allocflags; + + /* + * Let the initial 
higher-order allocation fail under memory pressure + * so we fall-back to the minimum order allocation. + */ + alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM; + + slab = alloc_slab_page(alloc_gfp, node, oo); + if (unlikely(!slab)) { + oo = s->min; + alloc_gfp = flags; + /* + * Allocation may have failed due to fragmentation. + * Try a lower order alloc if possible + */ + slab = alloc_slab_page(alloc_gfp, node, oo); + if (unlikely(!slab)) + return NULL; + stat(s, ORDER_FALLBACK); + } + + slab->objects = oo_objects(oo); + slab->inuse = 0; + slab->frozen = 0; + + account_slab(slab, oo_order(oo), s, flags); + + slab->slab_cache = s; + + kasan_poison_slab(slab); + + start = slab_address(slab); + + setup_slab_debug(s, slab, start); + + shuffle = shuffle_freelist(s, slab); + + if (!shuffle) { + start = fixup_red_left(s, start); + start = setup_object(s, start); + slab->freelist = start; + for (idx = 0, p = start; idx < slab->objects - 1; idx++) { + next = p + s->size; + next = setup_object(s, next); + set_freepointer(s, p, next); + p = next; + } + set_freepointer(s, p, NULL); + } + + return slab; +} + +static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + if (unlikely(flags & GFP_SLAB_BUG_MASK)) + flags = kmalloc_fix_flags(flags); + + WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); + + return allocate_slab(s, + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); +} + +static void __free_slab(struct kmem_cache *s, struct slab *slab) +{ + struct folio *folio = slab_folio(slab); + int order = folio_order(folio); + int pages = 1 << order; + + if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { + void *p; + + slab_pad_check(s, slab); + for_each_object(p, s, slab_address(slab), slab->objects) + check_object(s, slab, p, SLUB_RED_INACTIVE); + } + + __slab_clear_pfmemalloc(slab); + 
__folio_clear_slab(folio); + folio->mapping = NULL; + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += pages; + unaccount_slab(slab, order, s); + __free_pages(folio_page(folio, 0), order); +} + +static void rcu_free_slab(struct rcu_head *h) +{ + struct slab *slab = container_of(h, struct slab, rcu_head); + + __free_slab(slab->slab_cache, slab); +} + +static void free_slab(struct kmem_cache *s, struct slab *slab) +{ + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { + call_rcu(&slab->rcu_head, rcu_free_slab); + } else + __free_slab(s, slab); +} + +static void discard_slab(struct kmem_cache *s, struct slab *slab) +{ + dec_slabs_node(s, slab_nid(slab), slab->objects); + free_slab(s, slab); +} + +/* + * Management of partially allocated slabs. + */ +static inline void +__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) +{ + n->nr_partial++; + if (tail == DEACTIVATE_TO_TAIL) + list_add_tail(&slab->slab_list, &n->partial); + else + list_add(&slab->slab_list, &n->partial); +} + +static inline void add_partial(struct kmem_cache_node *n, + struct slab *slab, int tail) +{ + lockdep_assert_held(&n->list_lock); + __add_partial(n, slab, tail); +} + +static inline void remove_partial(struct kmem_cache_node *n, + struct slab *slab) +{ + lockdep_assert_held(&n->list_lock); + list_del(&slab->slab_list); + n->nr_partial--; +} + +/* + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * slab from the n->partial list. Remove only a single object from the slab, do + * the alloc_debug_processing() checks and leave the slab on the list, or move + * it to full list if it was the last free object. 
+ */ +static void *alloc_single_from_partial(struct kmem_cache *s, + struct kmem_cache_node *n, struct slab *slab, int orig_size) +{ + void *object; + + lockdep_assert_held(&n->list_lock); + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse++; + + if (!alloc_debug_processing(s, slab, object, orig_size)) { + remove_partial(n, slab); + return NULL; + } + + if (slab->inuse == slab->objects) { + remove_partial(n, slab); + add_full(s, n, slab); + } + + return object; +} + +/* + * Called only for kmem_cache_debug() caches to allocate from a freshly + * allocated slab. Allocate a single object instead of the whole freelist + * and put the slab on the partial (or full) list. + * + * Returns the object, or NULL if alloc_debug_processing() rejects it; in + * that case the slab is not added to any list. + */ +static void *alloc_single_from_new_slab(struct kmem_cache *s, + struct slab *slab, int orig_size) +{ + int nid = slab_nid(slab); + struct kmem_cache_node *n = get_node(s, nid); + unsigned long flags; + void *object; + + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse = 1; + + if (!alloc_debug_processing(s, slab, object, orig_size)) + /* + * It's not really expected that this would fail on a + * freshly allocated slab, but a concurrent memory + * corruption in theory could cause that. + */ + return NULL; + + spin_lock_irqsave(&n->list_lock, flags); + + if (slab->inuse == slab->objects) + add_full(s, n, slab); + else + add_partial(n, slab, DEACTIVATE_TO_HEAD); + + inc_slabs_node(s, nid, slab->objects); + spin_unlock_irqrestore(&n->list_lock, flags); + + return object; +} + +/* + * Remove slab from the partial list, freeze it and + * return the pointer to the freelist. + * + * A non-zero mode takes all free objects: the slab's freelist becomes NULL + * and inuse is set to objects. A zero mode leaves the freelist in place. + * + * Returns a list of objects or NULL if the cmpxchg fails. 
+ */ +static inline void *acquire_slab(struct kmem_cache *s, + struct kmem_cache_node *n, struct slab *slab, + int mode) +{ + void *freelist; + unsigned long counters; + struct slab new; + + lockdep_assert_held(&n->list_lock); + + /* + * Zap the freelist and set the frozen bit.
+ * The old freelist is the list of objects for the + * per cpu allocation list. + */ + freelist = slab->freelist; + counters = slab->counters; + new.counters = counters; + if (mode) { + new.inuse = slab->objects; + new.freelist = NULL; + } else { + new.freelist = freelist; + } + + VM_BUG_ON(new.frozen); + new.frozen = 1; + + if (!__cmpxchg_double_slab(s, slab, + freelist, counters, + new.freelist, new.counters, + "acquire_slab")) + return NULL; + + remove_partial(n, slab); + WARN_ON(!freelist); + return freelist; +} + +#ifdef CONFIG_SLUB_CPU_PARTIAL +static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); +#else +static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab, + int drain) { } +#endif +static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); + +/* + * Try to allocate a partial slab from a specific node. + * + * For kmem_cache_debug() caches a single object is taken with + * alloc_single_from_partial(). Otherwise the first slab acquired is + * returned through pc->slab and any further acquired slabs are handed to + * put_cpu_partial(). + * + * Returns an object (or freelist), or NULL if nothing could be obtained. + */ +static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + struct partial_context *pc) +{ + struct slab *slab, *slab2; + void *object = NULL; + unsigned long flags; + unsigned int partial_slabs = 0; + + /* + * Racy check. If we mistakenly see no partial slabs then we + * just allocate an empty slab. If we mistakenly try to get a + * partial slab and there is none available then get_partial() + * will return NULL.
+ */ + if (!n || !n->nr_partial) + return NULL; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { + void *t; + + if (!pfmemalloc_match(slab, pc->flags)) + continue; + + if (kmem_cache_debug(s)) { + object = alloc_single_from_partial(s, n, slab, + pc->orig_size); + if (object) + break; + continue; + } + + t = acquire_slab(s, n, slab, object == NULL); + if (!t) + break; + + if (!object) { + *pc->slab = slab; + stat(s, ALLOC_FROM_PARTIAL); + object = t; + } else { + put_cpu_partial(s, slab, 0); + stat(s, CPU_PARTIAL_NODE); + partial_slabs++; + } +#ifdef CONFIG_SLUB_CPU_PARTIAL + if (!kmem_cache_has_cpu_partial(s) + || partial_slabs > s->cpu_partial_slabs / 2) + break; +#else + break; +#endif + + } + spin_unlock_irqrestore(&n->list_lock, flags); + return object; +} + +/* + * Get a slab from somewhere. Search in increasing NUMA distances. + */ +static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) +{ +#ifdef CONFIG_NUMA + struct zonelist *zonelist; + struct zoneref *z; + struct zone *zone; + enum zone_type highest_zoneidx = gfp_zone(pc->flags); + void *object; + unsigned int cpuset_mems_cookie; + + /* + * The defrag ratio allows a configuration of the tradeoffs between + * inter node defragmentation and node local allocations. A lower + * defrag_ratio increases the tendency to do local allocations + * instead of attempting to obtain partial slabs from other nodes. + * + * If the defrag_ratio is set to 0 then kmalloc() always + * returns node local objects. If the ratio is higher then kmalloc() + * may return off node objects because partial slabs are obtained + * from other nodes and filled up. + * + * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100 + * (which makes defrag_ratio = 1000) then every (well almost) + * allocation will first attempt to defrag slab caches on other nodes.
+ * This means scanning over all nodes to look for partial slabs which + * may be expensive if we do it every time we are trying to find a slab + * with available objects. + */ + if (!s->remote_node_defrag_ratio || + get_cycles() % 1024 > s->remote_node_defrag_ratio) + return NULL; + + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), pc->flags); + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { + struct kmem_cache_node *n; + + n = get_node(s, zone_to_nid(zone)); + + if (n && cpuset_zone_allowed(zone, pc->flags) && + n->nr_partial > s->min_partial) { + object = get_partial_node(s, n, pc); + if (object) { + /* + * Don't check read_mems_allowed_retry() + * here - if mems_allowed was updated in + * parallel, that was a harmless race + * between allocation and the cpuset + * update + */ + return object; + } + } + } + } while (read_mems_allowed_retry(cpuset_mems_cookie)); +#endif /* CONFIG_NUMA */ + return NULL; +} + +/* + * Get an object (or freelist) from a partial slab. The requested node is + * tried first, or the node given by numa_mem_id() for NUMA_NO_NODE. Only + * when no specific node was requested do we fall back to get_any_partial(). + */ +static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc) +{ + void *object; + int searchnode = node; + + if (node == NUMA_NO_NODE) + searchnode = numa_mem_id(); + + object = get_partial_node(s, get_node(s, searchnode), pc); + if (object || node != NUMA_NO_NODE) + return object; + + return get_any_partial(s, pc); +} + +#ifdef CONFIG_PREEMPTION +/* + * Calculate the next globally unique transaction for disambiguation + * during cmpxchg. The transactions start with the cpu number and are then + * incremented by CONFIG_NR_CPUS rounded up to a power of two. + */ +#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) +#else +/* + * No preemption supported therefore also no need to check for + * different cpus.
+ */ +#define TID_STEP 1 +#endif + +/* Advance tid by one operation; tid % TID_STEP is left unchanged. */ +static inline unsigned long next_tid(unsigned long tid) +{ + return tid + TID_STEP; +} + +#ifdef SLUB_DEBUG_CMPXCHG +static inline unsigned int tid_to_cpu(unsigned long tid) +{ + return tid % TID_STEP; +} + +static inline unsigned long tid_to_event(unsigned long tid) +{ + return tid / TID_STEP; +} +#endif + +/* The initial transaction id of a cpu is its cpu number. */ +static inline unsigned int init_tid(int cpu) +{ + return cpu; +} + +/* + * Count a failed per-cpu cmpxchg. With SLUB_DEBUG_CMPXCHG, also log whether + * the cpu part or the event part of the tid changed since tid was read. + */ +static inline void note_cmpxchg_failure(const char *n, + const struct kmem_cache *s, unsigned long tid) +{ +#ifdef SLUB_DEBUG_CMPXCHG + unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); + + pr_info("%s %s: cmpxchg redo ", n, s->name); + +#ifdef CONFIG_PREEMPTION + if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) + /* tid_to_cpu() returns unsigned int, hence %u */ + pr_warn("due to cpu change %u -> %u\n", + tid_to_cpu(tid), tid_to_cpu(actual_tid)); + else +#endif + if (tid_to_event(tid) != tid_to_event(actual_tid)) + /* tid_to_event() returns unsigned long, hence %lu */ + pr_warn("due to cpu running other code. Event %lu->%lu\n", + tid_to_event(tid), tid_to_event(actual_tid)); + else + pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", + actual_tid, tid, next_tid(tid)); +#endif + stat(s, CMPXCHG_DOUBLE_CPU_FAIL); +} + +/* Set up the local lock and the initial tid of every possible cpu. */ +static void init_kmem_cache_cpus(struct kmem_cache *s) +{ + int cpu; + struct kmem_cache_cpu *c; + + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(s->cpu_slab, cpu); + local_lock_init(&c->lock); + c->tid = init_tid(cpu); + } +} + +/* + * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist, + * unfreezes the slab and puts it on the proper list. 
+ * Assumes the slab has been already safely taken away from kmem_cache_cpu + * by the caller.
+ */ +static void deactivate_slab(struct kmem_cache *s, struct slab *slab, + void *freelist) +{ + enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE, M_FULL_NOLIST }; + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + int free_delta = 0; + enum slab_modes mode = M_NONE; + void *nextfree, *freelist_iter, *freelist_tail; + int tail = DEACTIVATE_TO_HEAD; + unsigned long flags = 0; + struct slab new; + struct slab old; + + if (slab->freelist) { + stat(s, DEACTIVATE_REMOTE_FREES); + tail = DEACTIVATE_TO_TAIL; + } + + /* + * Stage one: Count the objects on cpu's freelist as free_delta and + * remember the last object in freelist_tail for later splicing. + */ + freelist_tail = NULL; + freelist_iter = freelist; + while (freelist_iter) { + nextfree = get_freepointer(s, freelist_iter); + + /* + * If 'nextfree' is invalid, it is possible that the object at + * 'freelist_iter' is already corrupted. So isolate all objects + * starting at 'freelist_iter' by skipping them. + */ + if (freelist_corrupted(s, slab, &freelist_iter, nextfree)) + break; + + freelist_tail = freelist_iter; + free_delta++; + + freelist_iter = nextfree; + } + + /* + * Stage two: Unfreeze the slab while splicing the per-cpu + * freelist to the head of slab's freelist. + * + * Ensure that the slab is unfrozen while the list presence + * reflects the actual number of objects during unfreeze. + * + * We first perform the cmpxchg holding the lock and insert to the + * list when it succeeds. If there is a mismatch then the slab is not + * unfrozen and the number of objects in the slab may have changed. + * Then release the lock and retry the cmpxchg.
+ */ +redo: + + old.freelist = READ_ONCE(slab->freelist); + old.counters = READ_ONCE(slab->counters); + VM_BUG_ON(!old.frozen); + + /* Determine target state of the slab */ + new.counters = old.counters; + if (freelist_tail) { + new.inuse -= free_delta; + set_freepointer(s, freelist_tail, old.freelist); + new.freelist = freelist; + } else + new.freelist = old.freelist; + + new.frozen = 0; + + if (!new.inuse && n->nr_partial >= s->min_partial) { + mode = M_FREE; + } else if (new.freelist) { + mode = M_PARTIAL; + /* + * Taking the spinlock removes the possibility that + * acquire_slab() will see a slab that is frozen + */ + spin_lock_irqsave(&n->list_lock, flags); + } else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) { + mode = M_FULL; + /* + * This also ensures that the scanning of full + * slabs from diagnostic functions will not see + * any frozen slabs. + */ + spin_lock_irqsave(&n->list_lock, flags); + } else { + mode = M_FULL_NOLIST; + } + + + if (!cmpxchg_double_slab(s, slab, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")) { + if (mode == M_PARTIAL || mode == M_FULL) + spin_unlock_irqrestore(&n->list_lock, flags); + goto redo; + } + + + if (mode == M_PARTIAL) { + add_partial(n, slab, tail); + spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, tail); + } else if (mode == M_FREE) { + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, slab); + stat(s, FREE_SLAB); + } else if (mode == M_FULL) { + add_full(s, n, slab); + spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, DEACTIVATE_FULL); + } else if (mode == M_FULL_NOLIST) { + stat(s, DEACTIVATE_FULL); + } +} + +#ifdef CONFIG_SLUB_CPU_PARTIAL +static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) +{ + struct kmem_cache_node *n = NULL, *n2 = NULL; + struct slab *slab, *slab_to_discard = NULL; + unsigned long flags = 0; + + while (partial_slab) { + struct slab new; + struct slab old; + + slab = partial_slab; + partial_slab = slab->next; + + n2 = 
get_node(s, slab_nid(slab)); + if (n != n2) { + if (n) + spin_unlock_irqrestore(&n->list_lock, flags); + + n = n2; + spin_lock_irqsave(&n->list_lock, flags); + } + + do { + + old.freelist = slab->freelist; + old.counters = slab->counters; + VM_BUG_ON(!old.frozen); + + new.counters = old.counters; + new.freelist = old.freelist; + + new.frozen = 0; + + } while (!__cmpxchg_double_slab(s, slab, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")); + + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { + slab->next = slab_to_discard; + slab_to_discard = slab; + } else { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } + + if (n) + spin_unlock_irqrestore(&n->list_lock, flags); + + while (slab_to_discard) { + slab = slab_to_discard; + slab_to_discard = slab_to_discard->next; + + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, slab); + stat(s, FREE_SLAB); + } +} + +/* + * Unfreeze all the cpu partial slabs of the current cpu. The list is + * detached under the cpu_slab local lock and unfrozen after the lock has + * been dropped. + */ +static void unfreeze_partials(struct kmem_cache *s) +{ + struct slab *partial_slab; + unsigned long flags; + + local_lock_irqsave(&s->cpu_slab->lock, flags); + partial_slab = this_cpu_read(s->cpu_slab->partial); + this_cpu_write(s->cpu_slab->partial, NULL); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (partial_slab) + __unfreeze_partials(s, partial_slab); +} + +static void unfreeze_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) +{ + struct slab *partial_slab; + + partial_slab = slub_percpu_partial(c); + c->partial = NULL; + + if (partial_slab) + __unfreeze_partials(s, partial_slab); +} + +/* + * Put a slab that was just frozen (in __slab_free|get_partial_node) into a + * partial slab slot if available. + * + * If we did not find a slot then simply move all the partials to the + * per node partial list.
+ */ +static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) +{ + struct slab *oldslab; + struct slab *slab_to_unfreeze = NULL; + unsigned long flags; + int slabs = 0; + + local_lock_irqsave(&s->cpu_slab->lock, flags); + + oldslab = this_cpu_read(s->cpu_slab->partial); + + if (oldslab) { + if (drain && oldslab->slabs >= s->cpu_partial_slabs) { + /* + * Partial array is full. Move the existing set to the + * per node partial list. Postpone the actual unfreezing + * until we are outside of the critical section. + */ + slab_to_unfreeze = oldslab; + oldslab = NULL; + } else { + slabs = oldslab->slabs; + } + } + + slabs++; + + slab->slabs = slabs; + slab->next = oldslab; + + this_cpu_write(s->cpu_slab->partial, slab); + + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (slab_to_unfreeze) { + __unfreeze_partials(s, slab_to_unfreeze); + stat(s, CPU_PARTIAL_DRAIN); + } +} + +#else /* CONFIG_SLUB_CPU_PARTIAL */ + +static inline void unfreeze_partials(struct kmem_cache *s) { } +static inline void unfreeze_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { } + +#endif /* CONFIG_SLUB_CPU_PARTIAL */ + +static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) +{ + unsigned long flags; + struct slab *slab; + void *freelist; + + local_lock_irqsave(&s->cpu_slab->lock, flags); + + slab = c->slab; + freelist = c->freelist; + + c->slab = NULL; + c->freelist = NULL; + c->tid = next_tid(c->tid); + + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (slab) { + deactivate_slab(s, slab, freelist); + stat(s, CPUSLAB_FLUSH); + } +} + +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) +{ + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + void *freelist = c->freelist; + struct slab *slab = c->slab; + + c->slab = NULL; + c->freelist = NULL; + c->tid = next_tid(c->tid); + + if (slab) { + deactivate_slab(s, slab, freelist); + stat(s, CPUSLAB_FLUSH); + } + + unfreeze_partials_cpu(s, c); +} + +struct 
slub_flush_work { + struct work_struct work; + struct kmem_cache *s; + bool skip; +}; + +/* + * Flush the cpu slab and the cpu partial slabs of the current cpu. + * + * Called from CPU work handler with migration disabled. + */ +static void flush_cpu_slab(struct work_struct *w) +{ + struct kmem_cache *s; + struct kmem_cache_cpu *c; + struct slub_flush_work *sfw; + + sfw = container_of(w, struct slub_flush_work, work); + + s = sfw->s; + c = this_cpu_ptr(s->cpu_slab); + + if (c->slab) + flush_slab(s, c); + + unfreeze_partials(s); +} + +static bool has_cpu_slab(int cpu, struct kmem_cache *s) +{ + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return c->slab || slub_percpu_partial(c); +} + +static DEFINE_MUTEX(flush_lock); +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); + +static void flush_all_cpus_locked(struct kmem_cache *s) +{ + struct slub_flush_work *sfw; + unsigned int cpu; + + lockdep_assert_cpus_held(); + mutex_lock(&flush_lock); + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + if (!has_cpu_slab(cpu, s)) { + sfw->skip = true; + continue; + } + INIT_WORK(&sfw->work, flush_cpu_slab); + sfw->skip = false; + sfw->s = s; + queue_work_on(cpu, flushwq, &sfw->work); + } + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + if (sfw->skip) + continue; + flush_work(&sfw->work); + } + + mutex_unlock(&flush_lock); +} + +static void flush_all(struct kmem_cache *s) +{ + cpus_read_lock(); + flush_all_cpus_locked(s); + cpus_read_unlock(); +} + +/* + * Use the cpu notifier to ensure that the cpu slabs are flushed when + * necessary. Flushes the given cpu's slabs for every cache on slab_caches + * while holding slab_mutex. + */ +static int slub_cpu_dead(unsigned int cpu) +{ + struct kmem_cache *s; + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) + __flush_cpu_slab(s, cpu); + mutex_unlock(&slab_mutex); + return 0; +} + +/* + * Check if the objects in a per cpu structure fit numa + * locality expectations.
+ */ +static inline int node_match(struct slab *slab, int node) +{ +#ifdef CONFIG_NUMA + if (node != NUMA_NO_NODE && slab_nid(slab) != node) + return 0; +#endif + return 1; +} + +#ifdef CONFIG_SLUB_DEBUG +/* Number of objects in the slab that are not in use. */ +static int count_free(struct slab *slab) +{ + return slab->objects - slab->inuse; +} + +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->total_objects); +} + +/* + * Supports checking bulk free of a constructed freelist. + * + * The objects from head to tail are checked and, if all checks pass, freed + * to the slab while n->list_lock is held. An empty slab is discarded when + * the node already has at least min_partial partial slabs. + */ +static noinline void free_debug_processing( + struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + struct slab *slab_free = NULL; + void *object = head; + int cnt = 0; + unsigned long flags; + bool checks_ok = false; + depot_stack_handle_t handle = 0; + + if (s->flags & SLAB_STORE_USER) + handle = set_track_prepare(); + + spin_lock_irqsave(&n->list_lock, flags); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, slab)) + goto out; + } + + if (slab->inuse < bulk_cnt) { + slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", + slab->inuse, bulk_cnt); + goto out; + } + +next_object: + + if (++cnt > bulk_cnt) + goto out_cnt; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, slab, object, addr)) + goto out; + } + + if (s->flags & SLAB_STORE_USER) + set_track_update(s, object, TRACK_FREE, addr, handle); + trace(s, slab, object, 0); + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ + init_object(s, object, SLUB_RED_INACTIVE); + + /* Reached end of constructed freelist yet?
*/ + if (object != tail) { + object = get_freepointer(s, object); + goto next_object; + } + checks_ok = true; + +out_cnt: + if (cnt != bulk_cnt) + slab_err(s, slab, "Bulk free expected %d objects but found %d\n", + bulk_cnt, cnt); + +out: + if (checks_ok) { + void *prior = slab->freelist; + + /* Perform the actual freeing while we still hold the locks */ + slab->inuse -= cnt; + set_freepointer(s, tail, prior); + slab->freelist = head; + + /* + * If the slab is empty, and node's partial list is full, + * it should be discarded anyway no matter whether it's on + * the full or partial list. + */ + if (slab->inuse == 0 && n->nr_partial >= s->min_partial) + slab_free = slab; + + if (!prior) { + /* was on full list */ + remove_full(s, n, slab); + if (!slab_free) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } else if (slab_free) { + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } + } + + if (slab_free) { + /* + * Update the counters while still holding n->list_lock to + * prevent spurious validation warnings + */ + dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + if (!checks_ok) + slab_fix(s, "Object at 0x%p not freed", object); + + if (slab_free) { + stat(s, FREE_SLAB); + free_slab(s, slab_free); + } +} +#endif /* CONFIG_SLUB_DEBUG */ + +#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) +/* Sum get_count() over the node's partial list, under n->list_lock. */ +static unsigned long count_partial(struct kmem_cache_node *n, + int (*get_count)(struct slab *)) +{ + unsigned long flags; + unsigned long x = 0; + struct slab *slab; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(slab, &n->partial, slab_list) + x += get_count(slab); + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} +#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ + +/* + * Print a rate-limited warning describing a failed slab allocation, unless + * __GFP_NOWARN is set. 
Does nothing without CONFIG_SLUB_DEBUG. + */ +static noinline void +slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) +{ +#ifdef CONFIG_SLUB_DEBUG + static DEFINE_RATELIMIT_STATE(slub_oom_rs, 
DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + int node; + struct kmem_cache_node *n; + + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) + return; + + pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", + nid, gfpflags, &gfpflags); + pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n", + s->name, s->object_size, s->size, oo_order(s->oo), + oo_order(s->min)); + + if (oo_order(s->min) > get_order(s->object_size)) + pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", + s->name); + + for_each_kmem_cache_node(s, node, n) { + unsigned long nr_slabs; + unsigned long nr_objs; + unsigned long nr_free; + + nr_free = count_partial(n, count_free); + nr_slabs = node_nr_slabs(n); + nr_objs = node_nr_objs(n); + + /* all three counters are unsigned long, hence %lu */ + pr_warn(" node %d: slabs: %lu, objs: %lu, free: %lu\n", + node, nr_slabs, nr_objs, nr_free); + } +#endif +} + +/* + * A slab marked pfmemalloc may only serve allocations for which + * gfp_pfmemalloc_allowed() holds; any other slab always matches. + */ +static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) +{ + if (unlikely(slab_test_pfmemalloc(slab))) + return gfp_pfmemalloc_allowed(gfpflags); + + return true; +} + +/* + * Check the slab->freelist and either transfer the freelist to the + * per cpu freelist or deactivate the slab. + * + * The slab is still frozen if the return value is not NULL. + * + * If this function returns NULL then the slab has been unfrozen. 
+ */ +static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) +{ + struct slab new; + unsigned long counters; + void *freelist; + + lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); + + do { + freelist = slab->freelist; + counters = slab->counters; + + new.counters = counters; + VM_BUG_ON(!new.frozen); + + new.inuse = slab->objects; + new.frozen = freelist != NULL; + + } while (!__cmpxchg_double_slab(s, slab, + freelist, counters, + NULL, new.counters, + "get_freelist")); + + return freelist; +} + +/* + * Slow path. The lockless freelist is empty or we need to perform + * debugging duties.
+ * + * Processing is still very fast if new objects have been freed to the + * regular freelist. In that case we simply take over the regular freelist + * as the lockless freelist and zap the regular freelist. + * + * If that is not working then we fall back to the partial lists. We take the + * first element of the freelist as the object to allocate now and move the + * rest of the freelist to the lockless freelist. + * + * And if we were unable to get a new slab from the partial slab lists then + * we need to allocate a new slab. This is the slowest path since it involves + * a call to the page allocator and the setup of a new slab. + * + * Version of __slab_alloc to use when we know that preemption is + * already disabled (which is the case for bulk allocation). + */ +static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) +{ + void *freelist; + struct slab *slab; + unsigned long flags; + struct partial_context pc; + + stat(s, ALLOC_SLOWPATH); + +reread_slab: + + slab = READ_ONCE(c->slab); + if (!slab) { + /* + * if the node is not online or has no normal memory, just + * ignore the node constraint + */ + if (unlikely(node != NUMA_NO_NODE && + !node_isset(node, slab_nodes))) + node = NUMA_NO_NODE; + goto new_slab; + } +redo: + + if (unlikely(!node_match(slab, node))) { + /* + * same as above but node_match() being false already + * implies node != NUMA_NO_NODE + */ + if (!node_isset(node, slab_nodes)) { + node = NUMA_NO_NODE; + } else { + stat(s, ALLOC_NODE_MISMATCH); + goto deactivate_slab; + } + } + + /* + * By rights, we should be searching for a slab page that was + * PFMEMALLOC but right now, we are losing the pfmemalloc + * information when the page leaves the per-cpu allocator + */ + if (unlikely(!pfmemalloc_match(slab, gfpflags))) + goto deactivate_slab; + + /* must check c->slab again in case we got preempted and it changed */ + 
local_lock_irqsave(&s->cpu_slab->lock, flags); + if (unlikely(slab != c->slab)) { + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + goto reread_slab; + } + freelist = c->freelist; + if (freelist) + goto load_freelist; + + freelist = get_freelist(s, slab); + + if (!freelist) { + c->slab = NULL; + c->tid = next_tid(c->tid); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + stat(s, DEACTIVATE_BYPASS); + goto new_slab; + } + + stat(s, ALLOC_REFILL); + +load_freelist: + + lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); + + /* + * freelist is pointing to the list of objects to be used. + * slab is pointing to the slab from which the objects are obtained. + * That slab must be frozen for per cpu allocations to work. + * + * The first object is returned to the caller and the rest of the + * list becomes the cpu's lockless freelist. + */ + VM_BUG_ON(!c->slab->frozen); + c->freelist = get_freepointer(s, freelist); + c->tid = next_tid(c->tid); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + return freelist; + +deactivate_slab: + + local_lock_irqsave(&s->cpu_slab->lock, flags); + if (slab != c->slab) { + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + goto reread_slab; + } + freelist = c->freelist; + c->slab = NULL; + c->freelist = NULL; + c->tid = next_tid(c->tid); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + deactivate_slab(s, slab, freelist); + +new_slab: + + if (slub_percpu_partial(c)) { + local_lock_irqsave(&s->cpu_slab->lock, flags); + if (unlikely(c->slab)) { + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + goto reread_slab; + } + if (unlikely(!slub_percpu_partial(c))) { + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + /* we were preempted and the cpu partial list became empty */ + goto new_objects; + } + + slab = c->slab = slub_percpu_partial(c); + slub_set_percpu_partial(c, slab); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + stat(s, CPU_PARTIAL_ALLOC); + goto redo; + } + +new_objects: + + pc.flags = gfpflags; + pc.slab 
= &slab; + pc.orig_size = orig_size; + freelist = get_partial(s, node, &pc); + if (freelist) + goto 
check_new_slab; + + slub_put_cpu_ptr(s->cpu_slab); + slab = new_slab(s, gfpflags, node); + c = slub_get_cpu_ptr(s->cpu_slab); + + if (unlikely(!slab)) { + slab_out_of_memory(s, gfpflags, node); + return NULL; + } + + stat(s, ALLOC_SLAB); + + if (kmem_cache_debug(s)) { + freelist = alloc_single_from_new_slab(s, slab, orig_size); + + if (unlikely(!freelist)) + goto new_objects; + + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + + return freelist; + } + + /* + * No other reference to the slab yet so we can + * muck around with it freely without cmpxchg. + * Take the whole freelist and mark the slab frozen and fully in use. + */ + freelist = slab->freelist; + slab->freelist = NULL; + slab->inuse = slab->objects; + slab->frozen = 1; + + inc_slabs_node(s, slab_nid(slab), slab->objects); + +check_new_slab: + + if (kmem_cache_debug(s)) { + /* + * For debug caches here we had to go through + * alloc_single_from_partial() so just store the tracking info + * and return the object + */ + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + + return freelist; + } + + if (unlikely(!pfmemalloc_match(slab, gfpflags))) { + /* + * For !pfmemalloc_match() case we don't load freelist so that + * we don't make further mismatched allocations easier. + * The first object is returned and the rest of the freelist + * is handed to deactivate_slab(). 
+ */ + deactivate_slab(s, slab, get_freepointer(s, freelist)); + return freelist; + } + +retry_load_slab: + + local_lock_irqsave(&s->cpu_slab->lock, flags); + if (unlikely(c->slab)) { + void *flush_freelist = c->freelist; + struct slab *flush_slab = c->slab; + + c->slab = NULL; + c->freelist = NULL; + c->tid = next_tid(c->tid); + + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + deactivate_slab(s, flush_slab, flush_freelist); + + stat(s, CPUSLAB_FLUSH); + + goto retry_load_slab; + } + c->slab = slab; + + goto load_freelist; +} + +/* + * A wrapper for ___slab_alloc() for contexts where preemption is not yet + * disabled. Compensates for possible cpu changes by refetching the per cpu area + * pointer.
+ */ +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) +{ + void *p; + +#ifdef CONFIG_PREEMPT_COUNT + /* + * We may have been preempted and rescheduled on a different + * cpu before disabling preemption. Need to reload cpu area + * pointer. + */ + c = slub_get_cpu_ptr(s->cpu_slab); +#endif + + p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); +#ifdef CONFIG_PREEMPT_COUNT + slub_put_cpu_ptr(s->cpu_slab); +#endif + return p; +} + +/* + * If the object has been wiped upon free, make sure it's fully initialized by + * zeroing out the freelist pointer stored at s->offset within the object. + */ +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, + void *obj) +{ + if (unlikely(slab_want_init_on_free(s)) && obj) + memset((void *)((char *)kasan_reset_tag(obj) + s->offset), + 0, sizeof(void *)); +} + +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. + * + * Otherwise we can simply pick the next object from the lockless free list. 
+ */ +static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +{ + void *object; + struct kmem_cache_cpu *c; + struct slab *slab; + unsigned long tid; + struct obj_cgroup *objcg = NULL; + bool init = false; + + s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); + if (!s) + return NULL; + + object = kfence_alloc(s, orig_size, gfpflags); + if (unlikely(object)) + goto out; + +redo: + /* + * Must read kmem_cache cpu data via this cpu ptr. Preemption is + * enabled. We may switch back and forth between cpus while + * reading from one cpu area.
That does not matter as long + * as we end up on the original cpu again when doing the cmpxchg. + * + * We must guarantee that tid and kmem_cache_cpu are retrieved on the + * same cpu. We first read the kmem_cache_cpu pointer and use it to read + * the tid. If we are preempted and switched to another cpu between the + * two reads, it's OK as the two are still associated with the same cpu + * and cmpxchg later will validate the cpu. + */ + c = raw_cpu_ptr(s->cpu_slab); + tid = READ_ONCE(c->tid); + + /* + * Irqless object alloc/free algorithm used here depends on sequence + * of fetching cpu_slab's data. tid should be fetched before anything + * on c to guarantee that object and slab associated with previous tid + * won't be used with current tid. If we fetch tid first, object and + * slab could be one associated with next tid and our alloc/free + * request will fail. In this case, we will retry. So, no problem. + */ + barrier(); + + /* + * The transaction ids are globally unique per cpu and per operation on + * a per cpu queue. Thus it can be guaranteed that the cmpxchg_double + * occurs on the right processor and that there was no operation on the + * linked list in between. + */ + + object = c->freelist; + slab = c->slab; + + if (!USE_LOCKLESS_FAST_PATH() || + unlikely(!object || !slab || !node_match(slab, node))) { + object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); + } else { + void *next_object = get_freepointer_safe(s, object); + + /* + * The cmpxchg will only match if there was no additional + * operation and if we are on the right processor. + * + * The cmpxchg does the following atomically (without lock + * semantics!) + * 1. Relocate first pointer to the current per cpu area. + * 2. Verify that tid and freelist have not been changed + * 3. If they were not changed replace tid and freelist + * + * Since this is without lock semantics the protection is only + * against code executing on this cpu *not* from access by + * other cpus.
+ */ + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + object, tid, + next_object, next_tid(tid)))) { + + note_cmpxchg_failure("slab_alloc", s, tid); + goto redo; + } + prefetch_freepointer(s, next_object); + stat(s, ALLOC_FASTPATH); + } + + maybe_wipe_obj_freeptr(s, object); + init = slab_want_init_on_alloc(gfpflags, s); + +out: + slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init); + + return object; +} + +static __always_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags, unsigned long addr, size_t orig_size) +{ + return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size); +} + +static __always_inline +void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags) +{ + void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size); + + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); + + return ret; +} + +void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +{ + return __kmem_cache_alloc_lru(s, NULL, gfpflags); +} +EXPORT_SYMBOL(kmem_cache_alloc); + +void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags) +{ + return __kmem_cache_alloc_lru(s, lru, gfpflags); +} +EXPORT_SYMBOL(kmem_cache_alloc_lru); + +void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, + unsigned long caller) +{ + return slab_alloc_node(s, NULL, gfpflags, node, + caller, orig_size); +} + +void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) +{ + void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); + + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node); + + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc_node); + +/* + * Slow path handling. This may still be called frequently since objects + * have a longer lifetime than the cpu slabs in most processing loads. + * + * So we still attempt to reduce cache line usage.
Just take the slab + * lock and free the item. If there is no additional partial slab + * handling required then we can return immediately. + */ +static void __slab_free(struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int cnt, + unsigned long addr) + +{ + void *prior; + int was_frozen; + struct slab new; + unsigned long counters; + struct kmem_cache_node *n = NULL; + unsigned long flags; + + stat(s, FREE_SLOWPATH); +/* Nothing more to do here when kfence_free() returns true for the object. */ + if (kfence_free(head)) + return; +/* Debug caches take a separate path and never reach the cmpxchg loop below. */ + if (kmem_cache_debug(s)) { + free_debug_processing(s, slab, head, tail, cnt, addr); + return; + } + + do { + if (unlikely(n)) { + spin_unlock_irqrestore(&n->list_lock, flags); + n = NULL; + } + prior = slab->freelist; + counters = slab->counters; + set_freepointer(s, tail, prior); + new.counters = counters; + was_frozen = new.frozen; + new.inuse -= cnt; + if ((!new.inuse || !prior) && !was_frozen) { + + if (kmem_cache_has_cpu_partial(s) && !prior) { + + /* + * Slab was on no list before and will be + * partially empty. + * We can defer the list move and instead + * freeze it. + */ + new.frozen = 1; + + } else { /* Needs to be taken off a list */ + + n = get_node(s, slab_nid(slab)); + /* + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. + */ + spin_lock_irqsave(&n->list_lock, flags); + + } + } + + } while (!cmpxchg_double_slab(s, slab, + prior, counters, + head, new.counters, + "__slab_free")); + + if (likely(!n)) { + + if (likely(was_frozen)) { + /* + * The list lock was not taken therefore no list + * activity can be necessary. + */ + stat(s, FREE_FROZEN); + } else if (new.frozen) { + /* + * If we just froze the slab then put it onto the + * per cpu partial list.
+ */ + put_cpu_partial(s, slab, 1); + stat(s, CPU_PARTIAL_FREE); + } + + return; + } + + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) + goto slab_empty; + + /* + * Objects left in the slab. If it was not on the partial list before + * then add it. + */ + if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { + remove_full(s, n, slab); + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + spin_unlock_irqrestore(&n->list_lock, flags); + return; + +slab_empty: + if (prior) { + /* + * Slab on the partial list. + */ + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } else { + /* Slab must be on the full list */ + remove_full(s, n, slab); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, FREE_SLAB); + discard_slab(s, slab); +} + +/* + * Fastpath with forced inlining to produce a kfree and kmem_cache_free that + * can perform fastpath freeing without additional function calls. + * + * The fastpath is only possible if we are freeing to the current cpu slab + * of this processor. This is typically the case if we have just allocated + * the item before. + * + * If fastpath is not possible then fall back to __slab_free where we deal + * with all sorts of special processing. + * + * Bulk free of a freelist with several objects (all pointing to the + * same slab) is possible by specifying head and tail ptr, plus objects + * count (cnt). Bulk free is indicated by the tail pointer being set; + * when tail is NULL, head alone is freed. + */ +static __always_inline void do_slab_free(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, + int cnt, unsigned long addr) +{ + void *tail_obj = tail ? : head; + struct kmem_cache_cpu *c; + unsigned long tid; + void **freelist; + +redo: + /* + * Determine the current cpu's per cpu slab. + * The cpu may change afterward. However that does not matter since + * data is retrieved via this pointer. If we are on the same cpu + * during the cmpxchg then the free will succeed.
+ */ + c = raw_cpu_ptr(s->cpu_slab); + tid = READ_ONCE(c->tid); + + /* See the comment on barrier() in slab_alloc_node() */ + barrier(); + + if (unlikely(slab != c->slab)) { + __slab_free(s, slab, head, tail_obj, cnt, addr); + return; + } + + if (USE_LOCKLESS_FAST_PATH()) { + freelist = READ_ONCE(c->freelist); + + set_freepointer(s, tail_obj, freelist); + + if (unlikely(!this_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + freelist, tid, + head, next_tid(tid)))) { + + note_cmpxchg_failure("slab_free", s, tid); + goto redo; + } + } else { + /* Update the free list under the local lock */ + local_lock(&s->cpu_slab->lock); + c = this_cpu_ptr(s->cpu_slab); + if (unlikely(slab != c->slab)) { + local_unlock(&s->cpu_slab->lock); + goto redo; + } + tid = c->tid; + freelist = c->freelist; + + set_freepointer(s, tail_obj, freelist); + c->freelist = head; + c->tid = next_tid(tid); + + local_unlock(&s->cpu_slab->lock); + } + stat(s, FREE_FASTPATH); +} +/* Run the memcg free hook on @p, then free the list; do_slab_free() is only called when slab_free_freelist_hook() returns nonzero. */ +static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, + void *head, void *tail, void **p, int cnt, + unsigned long addr) +{ + memcg_slab_free_hook(s, slab, p, cnt); + /* + * With KASAN enabled slab_free_freelist_hook modifies the freelist + * to remove objects, whose reuse must be delayed.
+ */ + if (slab_free_freelist_hook(s, &head, &tail, &cnt)) + do_slab_free(s, slab, head, tail, cnt, addr); +} +/* Built only with CONFIG_KASAN_GENERIC: frees @x straight through do_slab_free(), without the hooks slab_free() runs. */ +#ifdef CONFIG_KASAN_GENERIC +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) +{ + do_slab_free(cache, virt_to_slab(x), x, NULL, 1, addr); +} +#endif +/* Free the single object @x to @s, recording @caller as the freeing address; no cache lookup or tracepoint here. */ +void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) +{ + slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller); +} +/* Exported free: resolve the cache with cache_from_obj() (return silently if it yields NULL), trace, then free @x. */ +void kmem_cache_free(struct kmem_cache *s, void *x) +{ + s = cache_from_obj(s, x); + if (!s) + return; + trace_kmem_cache_free(_RET_IP_, x, s); + slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_); +} +EXPORT_SYMBOL(kmem_cache_free); +/* Filled by build_detached_freelist(): objects of one slab chained from freelist to tail (cnt entries) plus the owning cache. */ +struct detached_freelist { + struct slab *slab; + void *tail; + void *freelist; + int cnt; + struct kmem_cache *s; +}; + +/* + * This function progressively scans the array with free objects (with + * a limited look ahead) and extract objects belonging to the same + * slab. It builds a detached freelist directly within the given + * slab/objects. This can happen without any need for + * synchronization, because the objects are owned by running process. + * The freelist is build up as a single linked list in the objects. + * The idea is, that this detached freelist can then be bulk + * transferred to the real freelist(s), but only requiring a single + * synchronization primitive. Look ahead in the array is limited due + * to performance reasons.
+ */ +static inline +int build_detached_freelist(struct kmem_cache *s, size_t size, + void **p, struct detached_freelist *df) +{ + int lookahead = 3; + void *object; + struct folio *folio; + size_t same; + + object = p[--size]; + folio = virt_to_folio(object); + if (!s) { + /* Handle kmalloc'ed objects: a non-slab folio is freed right here and df->slab = NULL tells the caller to skip slab_free() */ + if (unlikely(!folio_test_slab(folio))) { + free_large_kmalloc(folio, object); + df->slab = NULL; + return size; + } + /* Derive kmem_cache from object */ + df->slab = folio_slab(folio); + df->s = df->slab->slab_cache; + } else { + df->slab = folio_slab(folio); + df->s = cache_from_obj(s, object); /* Support for memcg */ + } + + /* Start new detached freelist */ + df->tail = object; + df->freelist = object; + df->cnt = 1; +/* A KFENCE object is returned as a one-element list; its free pointer is not written. */ + if (is_kfence_address(object)) + return size; + + set_freepointer(df->s, object, NULL); +/* Scan downwards from the end; each object found on df->slab is chained in and swapped to index 'same', so p[0..same) holds the objects still to be processed. */ + same = size; + while (size) { + object = p[--size]; + /* df->slab is always set at this point */ + if (df->slab == virt_to_slab(object)) { + /* Opportunity to build the freelist */ + set_freepointer(df->s, object, df->freelist); + df->freelist = object; + df->cnt++; + same--; + if (size != same) + swap(p[size], p[same]); + continue; + } + + /* Limit look ahead search */ + if (!--lookahead) + break; + } + + return same; +} + +/* + * Free the @size objects in @p. Each loop iteration builds one detached + * freelist and hands it to slab_free() in a single call. + * Note that interrupts must be enabled when calling this function. + */ +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + if (!size) + return; + + do { + struct detached_freelist df; + + size = build_detached_freelist(s, size, p, &df); + if (!df.slab) + continue; + + slab_free(df.s, df.slab, df.freelist, df.tail, &p[size], df.cnt, + _RET_IP_); + } while (likely(size)); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +/* Note that interrupts must be enabled when calling this function.
*/ +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + struct kmem_cache_cpu *c; + int i; + struct obj_cgroup *objcg = NULL; + + /* memcg and kmem_cache debug support; if the hook returns NULL, report 0 objects allocated */ + s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); + if (unlikely(!s)) + return false; + /* + * Drain objects in the per cpu slab, while disabling local + * IRQs, which protects against PREEMPT and interrupt + * handlers invoking the normal fastpath. + */ + c = slub_get_cpu_ptr(s->cpu_slab); + local_lock_irq(&s->cpu_slab->lock); + + for (i = 0; i < size; i++) { + void *object = kfence_alloc(s, s->object_size, flags); + + if (unlikely(object)) { + p[i] = object; + continue; + } + + object = c->freelist; + if (unlikely(!object)) { + /* + * We may have removed an object from c->freelist using + * the fastpath in the previous iteration; in that case, + * c->tid has not been bumped yet. + * Since ___slab_alloc() may reenable interrupts while + * allocating memory, we should bump c->tid now. + */ + c->tid = next_tid(c->tid); + + local_unlock_irq(&s->cpu_slab->lock); + + /* + * Invoking the slow path likely has the side effect + * of re-populating the per CPU c->freelist + */ + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, + _RET_IP_, c, s->object_size); + if (unlikely(!p[i])) + goto error; + + c = this_cpu_ptr(s->cpu_slab); + maybe_wipe_obj_freeptr(s, p[i]); + + local_lock_irq(&s->cpu_slab->lock); + + continue; /* next iteration of the for-loop */ + } + c->freelist = get_freepointer(s, object); + p[i] = object; + maybe_wipe_obj_freeptr(s, p[i]); + } + c->tid = next_tid(c->tid); + local_unlock_irq(&s->cpu_slab->lock); + slub_put_cpu_ptr(s->cpu_slab); + + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled fastpath loop.
+ */ + slab_post_alloc_hook(s, objcg, flags, size, p, + slab_want_init_on_alloc(flags, s)); + return i; +error: + slub_put_cpu_ptr(s->cpu_slab); + slab_post_alloc_hook(s, objcg, flags, i, p, false); + kmem_cache_free_bulk(s, i, p); + return 0; +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + + +/* + * Object placement in a slab is made very easy because we always start at + * offset 0. If we tune the size of the object to the alignment then we can + * get the required alignment by putting one properly sized object after + * another. + * + * Notice that the allocation order determines the sizes of the per cpu + * caches. Each processor always has one slab available for allocations. + * Increasing the allocation order reduces the number of times that slabs + * must be moved on and off the partial lists and is therefore a factor in + * locking overhead. + */ + +/* + * Minimum / Maximum order of slab pages. This influences locking overhead + * and slab fragmentation. A higher order reduces the number of partial slabs + * and increases the number of allocations possible without having to + * take the list_lock. + * + * slub_min_objects left at 0 means calculate_order() derives the minimum + * from the cpu count. + */ +static unsigned int slub_min_order; +static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static unsigned int slub_min_objects; + +/* + * Calculate the order of allocation given an slab object size. + * + * The order of allocation has significant impact on performance and other + * system components. Generally order 0 allocations should be preferred since + * order 0 does not cause fragmentation in the page allocator. Larger objects + * be problematic to put into order 0 slabs because there may be too much + * unused space left. We go to a higher order if more than 1/16th of the slab + * would be wasted. + * + * In order to reach satisfactory performance we must ensure that a minimum + * number of objects is in one slab. Otherwise we may generate too much + * activity on the partial lists which requires taking the list_lock.
This is + * less a concern for large slabs though which are rarely used. + * + * slub_max_order specifies the order where we begin to stop considering the + * number of objects in a slab as critical. If we reach slub_max_order then + * we try to keep the page order as low as possible. So we accept more waste + * of space in favor of a small page order. + * + * Higher order allocations also allow the placement of more objects in a + * slab and thereby reduce object handling overhead. If the user has + * requested a higher minimum order then we start with that one instead of + * the smallest order which will fit the object. + */ +static inline unsigned int calc_slab_order(unsigned int size, + unsigned int min_objects, unsigned int max_order, + unsigned int fract_leftover) +{ + unsigned int min_order = slub_min_order; + unsigned int order; + + if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE) + return get_order(size * MAX_OBJS_PER_PAGE) - 1; + + for (order = max(min_order, (unsigned int)get_order(min_objects * size)); + order <= max_order; order++) { + + unsigned int slab_size = (unsigned int)PAGE_SIZE << order; + unsigned int rem; + + rem = slab_size % size; + + if (rem <= slab_size / fract_leftover) + break; + } +/* If the loop never hit the break, order is greater than max_order; callers compare the result against their limit to detect that. */ + return order; +} +/* Pick the page order for slabs holding @size-byte objects; returns -ENOSYS when no order below MAX_ORDER is found. */ +static inline int calculate_order(unsigned int size) +{ + unsigned int order; + unsigned int min_objects; + unsigned int max_objects; + unsigned int nr_cpus; + + /* + * Attempt to find best configuration for a slab. This + * works by first attempting to generate a layout with + * the best configuration and backing off gradually. + * + * First we increase the acceptable waste in a slab. Then + * we reduce the minimum objects required in a slab. + */ + min_objects = slub_min_objects; + if (!min_objects) { + /* + * Some architectures will only update present cpus when + * onlining them, so don't trust the number if it's just 1.
But + * we also don't want to use nr_cpu_ids always, as on some other + * architectures, there can be many possible cpus, but never + * onlined. Here we compromise between trying to avoid too high + * order on systems that appear larger than they are, and too + * low order on systems that appear smaller than they are. + */ + nr_cpus = num_present_cpus(); + if (nr_cpus <= 1) + nr_cpus = nr_cpu_ids; + min_objects = 4 * (fls(nr_cpus) + 1); + } + max_objects = order_objects(slub_max_order, size); + min_objects = min(min_objects, max_objects); +/* Outer loop lowers min_objects; inner loop relaxes the waste limit from 1/16 to 1/8 to 1/4 of the slab. */ + while (min_objects > 1) { + unsigned int fraction; + + fraction = 16; + while (fraction >= 4) { + order = calc_slab_order(size, min_objects, + slub_max_order, fraction); + if (order <= slub_max_order) + return order; + fraction /= 2; + } + min_objects--; + } + + /* + * We were unable to place multiple objects in a slab. Now + * let's see if we can place a single object there. + */ + order = calc_slab_order(size, 1, slub_max_order, 1); + if (order <= slub_max_order) + return order; + + /* + * Doh this slab cannot be placed using slub_max_order. + */ + order = calc_slab_order(size, 1, MAX_ORDER, 1); + if (order < MAX_ORDER) + return order; + return -ENOSYS; +} +/* Initialise a per-node structure: zero nr_partial, init list_lock and the partial list; with CONFIG_SLUB_DEBUG also the slab/object counters and the full list. */ +static void +init_kmem_cache_node(struct kmem_cache_node *n) +{ + n->nr_partial = 0; + spin_lock_init(&n->list_lock); + INIT_LIST_HEAD(&n->partial); +#ifdef CONFIG_SLUB_DEBUG + atomic_long_set(&n->nr_slabs, 0); + atomic_long_set(&n->total_objects, 0); + INIT_LIST_HEAD(&n->full); +#endif +} +/* Allocate and initialise s->cpu_slab; returns 1 on success, 0 if the percpu allocation fails. */ +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) +{ + BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < + KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); + + /* + * Must align to double word boundary for the double cmpxchg + * instructions to work; see __pcpu_double_call_return_bool().
+ */ + s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), + 2 * sizeof(void *)); + + if (!s->cpu_slab) + return 0; + + init_kmem_cache_cpus(s); + + return 1; +} +/* The cache that struct kmem_cache_node objects are allocated from and freed to. */ +static struct kmem_cache *kmem_cache_node; + +/* + * No kmalloc_node yet so do it by hand. We know that this is the first + * slab on the node for this slabcache. There are no concurrent accesses + * possible. + * + * Note that this function only works on the kmem_cache_node + * when allocating for the kmem_cache_node. This is used for bootstrapping + * memory on a fresh node that has no slab structures yet. + */ +static void early_kmem_cache_node_alloc(int node) +{ + struct slab *slab; + struct kmem_cache_node *n; + + BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); + + slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); +/* NOTE(review): inc_slabs_node() is called for this slab both here and again after init_kmem_cache_node(); presumably the first call is a no-op while the node structure does not exist yet - confirm. */ + BUG_ON(!slab); + inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects); + if (slab_nid(slab) != node) { + pr_err("SLUB: Unable to allocate memory from node %d\n", node); + pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); + } +/* Take the first free object of the new slab by hand and use it as the node structure. */ + n = slab->freelist; + BUG_ON(!n); +#ifdef CONFIG_SLUB_DEBUG + init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); + init_tracking(kmem_cache_node, n); +#endif + n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); + slab->freelist = get_freepointer(kmem_cache_node, n); + slab->inuse = 1; + kmem_cache_node->node[node] = n; + init_kmem_cache_node(n); + inc_slabs_node(kmem_cache_node, node, slab->objects); + + /* + * No locks need to be taken here as it has just been + * initialized and there is no concurrent access.
+ */ + __add_partial(n, slab, DEACTIVATE_TO_HEAD); +} +/* Clear every per-node pointer of @s and return the structures to the kmem_cache_node cache. */ +static void free_kmem_cache_nodes(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { + s->node[node] = NULL; + kmem_cache_free(kmem_cache_node, n); + } +} +/* Release per-cache resources: calls cache_random_seq_destroy(), frees the percpu cpu_slab area and the per-node structures. */ +void __kmem_cache_release(struct kmem_cache *s) +{ + cache_random_seq_destroy(s); + free_percpu(s->cpu_slab); + free_kmem_cache_nodes(s); +} +/* Set up a kmem_cache_node for every node in slab_nodes; returns 1 on success, 0 after freeing the node structures on allocation failure. */ +static int init_kmem_cache_nodes(struct kmem_cache *s) +{ + int node; + + for_each_node_mask(node, slab_nodes) { + struct kmem_cache_node *n; + + if (slab_state == DOWN) { + early_kmem_cache_node_alloc(node); + continue; + } + n = kmem_cache_alloc_node(kmem_cache_node, + GFP_KERNEL, node); + + if (!n) { + free_kmem_cache_nodes(s); + return 0; + } + + init_kmem_cache_node(n); + s->node[node] = n; + } + return 1; +} +/* Choose the per-cpu partial object limit from s->size; compiles to an empty function without CONFIG_SLUB_CPU_PARTIAL. */ +static void set_cpu_partial(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL + unsigned int nr_objects; + + /* + * cpu_partial determines the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. + * + * For backwards compatibility reasons, this is determined as number + * of objects, even though we now limit maximum number of pages, see + * slub_set_cpu_partial() + */ + if (!kmem_cache_has_cpu_partial(s)) + nr_objects = 0; + else if (s->size >= PAGE_SIZE) + nr_objects = 6; + else if (s->size >= 1024) + nr_objects = 24; + else if (s->size >= 256) + nr_objects = 52; + else + nr_objects = 120; + + slub_set_cpu_partial(s, nr_objects); +#endif +} + +/* + * calculate_sizes() determines the order and the distribution of data within + * a slab object.
+ */ +static int calculate_sizes(struct kmem_cache *s) +{ + slab_flags_t flags = s->flags; + unsigned int size = s->object_size; + unsigned int order; + + /* + * Round up object size to the next word boundary. We can only + * place the free pointer at word boundaries and this determines + * the possible location of the free pointer. + */ + size = ALIGN(size, sizeof(void *)); + +#ifdef CONFIG_SLUB_DEBUG + /* + * Determine if we can poison the object itself. If the user of + * the slab may touch the object after free or before allocation + * then we should never poison the object itself. + */ + if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) && + !s->ctor) + s->flags |= __OBJECT_POISON; + else + s->flags &= ~__OBJECT_POISON; + + + /* + * If we are Redzoning then check if there is some space between the + * end of the object and the free pointer. If not then add an + * additional word to have some bytes to store Redzone information. + */ + if ((flags & SLAB_RED_ZONE) && size == s->object_size) + size += sizeof(void *); +#endif + + /* + * With that we have determined the number of bytes in actual use + * by the object and redzoning. + */ + s->inuse = size; + + if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || + ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || + s->ctor) { + /* + * Relocate free pointer after the object if it is not + * permitted to overwrite the first word of the object on + * kmem_cache_free. + * + * This is the case if we do RCU, have a constructor, + * are poisoning the objects, or are + * redzoning an object smaller than sizeof(void *). + * + * The assumption that s->offset >= s->inuse means free + * pointer is outside of the object is used in the + * freeptr_outside_object() function. If that is no + * longer true, the function needs to be modified.
+ */ + s->offset = size; + size += sizeof(void *); + } else { + /* + * Store freelist pointer near middle of object to keep + * it away from the edges of the object to avoid small + * sized over/underflows from neighboring allocations. + */ + s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *)); + } + +#ifdef CONFIG_SLUB_DEBUG + if (flags & SLAB_STORE_USER) { + /* + * Need to store information about allocs and frees after + * the object: one struct track for each. + */ + size += 2 * sizeof(struct track); + + /* Save the original kmalloc request size */ + if (flags & SLAB_KMALLOC) + size += sizeof(unsigned int); + } +#endif + + kasan_cache_create(s, &size, &s->flags); +#ifdef CONFIG_SLUB_DEBUG + if (flags & SLAB_RED_ZONE) { + /* + * Add some empty padding so that we can catch + * overwrites from earlier objects rather than let + * tracking information or the free pointer be + * corrupted if a user writes before the start + * of the object. + */ + size += sizeof(void *); + + s->red_left_pad = sizeof(void *); + s->red_left_pad = ALIGN(s->red_left_pad, s->align); + size += s->red_left_pad; + } +#endif + + /* + * SLUB stores one object immediately after another beginning from + * offset 0. In order to align the objects we have to simply size + * each object to conform to the alignment.
+ */ + size = ALIGN(size, s->align); + s->size = size; + s->reciprocal_size = reciprocal_value(size); + order = calculate_order(size); +/* calculate_order() returns a negative errno on failure; the value sits in an unsigned variable, hence the cast. */ + if ((int)order < 0) + return 0; + + s->allocflags = 0; + if (order) + s->allocflags |= __GFP_COMP; + + if (s->flags & SLAB_CACHE_DMA) + s->allocflags |= GFP_DMA; + + if (s->flags & SLAB_CACHE_DMA32) + s->allocflags |= GFP_DMA32; + + if (s->flags & SLAB_RECLAIM_ACCOUNT) + s->allocflags |= __GFP_RECLAIMABLE; + + /* + * Determine the number of objects per slab + */ + s->oo = oo_make(order, size); + s->min = oo_make(get_order(size), size); +/* Returns 1 when oo_objects(s->oo) is nonzero, 0 otherwise. */ + return !!oo_objects(s->oo); +} +/* Finish setting up @s (flags, sizes, partial limits, per-node and per-cpu state). Returns 0 on success, or -EINVAL after calling __kmem_cache_release(). */ +static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) +{ + s->flags = kmem_cache_flags(s->size, flags, s->name); +#ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); +#endif + + if (!calculate_sizes(s)) + goto error; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. + */ + if (get_order(s->size) > get_order(s->object_size)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(s)) + goto error; + } + } + +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ + defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0) + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; +#endif + + /* + * The larger the object size is, the more slabs we want on the partial + * list to avoid pounding the page allocator excessively.
+ */ + s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); + s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); + + set_cpu_partial(s); + +#ifdef CONFIG_NUMA + s->remote_node_defrag_ratio = 1000; +#endif + + /* Initialize the pre-computed randomized freelist if slab is up */ + if (slab_state >= UP) { + if (init_cache_random_seq(s)) + goto error; + } + + if (!init_kmem_cache_nodes(s)) + goto error; +/* Success is the only return before the label: alloc_kmem_cache_cpus() returns 1 on success; on 0 control falls through to error. */ + if (alloc_kmem_cache_cpus(s)) + return 0; + +error: + __kmem_cache_release(s); + return -EINVAL; +} +/* With CONFIG_SLUB_DEBUG: report @slab through slab_err() using @text as the format, then print every object whose bit is clear in the object map. Empty otherwise. */ +static void list_slab_objects(struct kmem_cache *s, struct slab *slab, + const char *text) +{ +#ifdef CONFIG_SLUB_DEBUG + void *addr = slab_address(slab); + void *p; + + slab_err(s, slab, text, s->name); + + spin_lock(&object_map_lock); + __fill_map(object_map, s, slab); + + for_each_object(p, s, addr, slab->objects) { + + if (!test_bit(__obj_to_index(s, addr, p), object_map)) { + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); + print_tracking(s, p); + } + } + spin_unlock(&object_map_lock); +#endif +} + +/* + * Attempt to free all partial slabs on a node. + * This is called from __kmem_cache_shutdown(). We must take list_lock + * because sysfs file might still access partial list after the shutdowning.
+ */ +static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) +{ + LIST_HEAD(discard); + struct slab *slab, *h; + + BUG_ON(irqs_disabled()); + spin_lock_irq(&n->list_lock); + list_for_each_entry_safe(slab, h, &n->partial, slab_list) { + if (!slab->inuse) { + remove_partial(n, slab); + list_add(&slab->slab_list, &discard); + } else { + list_slab_objects(s, slab, + "Objects remaining in %s on __kmem_cache_shutdown()"); + } + } + spin_unlock_irq(&n->list_lock); +/* The empty slabs collected above are discarded only after list_lock has been dropped. */ + list_for_each_entry_safe(slab, h, &discard, slab_list) + discard_slab(s, slab); +} +/* True when no node of @s has partial slabs and slabs_node() reports none either. */ +bool __kmem_cache_empty(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) + if (n->nr_partial || slabs_node(s, node)) + return false; + return true; +} + +/* + * Release all resources used by a slab cache. + * + * Returns 1 as soon as a node still has partial slabs or slabs counted + * by slabs_node() after free_partial(), 0 if every node is clean. + */ +int __kmem_cache_shutdown(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + flush_all_cpus_locked(s); + /* Attempt to free all objects */ + for_each_kmem_cache_node(s, node, n) { + free_partial(s, n); + if (n->nr_partial || slabs_node(s, node)) + return 1; + } + return 0; +} +/* Fill @kpp with the slab, cache and object start for @object; tracking data is added only when SLAB_STORE_USER is set and CONFIG_SLUB_DEBUG is built in. */ +#ifdef CONFIG_PRINTK +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) +{ + void *base; + int __maybe_unused i; + unsigned int objnr; + void *objp; + void *objp0; + struct kmem_cache *s = slab->slab_cache; + struct track __maybe_unused *trackp; + + kpp->kp_ptr = object; + kpp->kp_slab = slab; + kpp->kp_slab_cache = s; + base = slab_address(slab); + objp0 = kasan_reset_tag(object); +#ifdef CONFIG_SLUB_DEBUG + objp = restore_red_left(s, objp0); +#else + objp = objp0; +#endif + objnr = obj_to_index(s, slab, objp); + kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp); + objp = base + s->size * objnr; + kpp->kp_objp = objp; + if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size + || (objp - base) % s->size) || + !(s->flags & SLAB_STORE_USER)) + return; +#ifdef CONFIG_SLUB_DEBUG + objp = 
fixup_red_left(s, objp); + trackp = get_track(s, objp, TRACK_ALLOC); + kpp->kp_ret = (void *)trackp->addr; +#ifdef CONFIG_STACKDEPOT + { + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries; + + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_stack[i] = (void *)entries[i]; + } + + trackp = get_track(s, objp, TRACK_FREE); + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_free_stack[i] = (void *)entries[i]; + } + } +#endif +#endif +} +#endif + +/******************************************************************** + * Kmalloc subsystem + *******************************************************************/ +/* Boot parameter "slub_min_order=": parsed with get_option() into slub_min_order. */ +static int __init setup_slub_min_order(char *str) +{ + get_option(&str, (int *)&slub_min_order); + + return 1; +} + +__setup("slub_min_order=", setup_slub_min_order); +/* Boot parameter "slub_max_order=": the parsed value is clamped to MAX_ORDER - 1. */ +static int __init setup_slub_max_order(char *str) +{ + get_option(&str, (int *)&slub_max_order); + slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1); + + return 1; +} + +__setup("slub_max_order=", setup_slub_max_order); +/* Boot parameter "slub_min_objects=": parsed with get_option() into slub_min_objects. */ +static int __init setup_slub_min_objects(char *str) +{ + get_option(&str, (int *)&slub_min_objects); + + return 1; +} + +__setup("slub_min_objects=", setup_slub_min_objects); + +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects incorrectly sized objects and objects that are to be copied + * to/from userspace but do not fall entirely within the containing slab + * cache's usercopy region. + * + * Returns nothing: a passing check simply returns, a failing one is + * reported through usercopy_abort().
+ */ +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user) +{ + struct kmem_cache *s; + unsigned int offset; + bool is_kfence = is_kfence_address(ptr); + + ptr = kasan_reset_tag(ptr); + + /* Find the cache this slab belongs to. */ + s = slab->slab_cache; + + /* Reject impossible pointers. */ + if (ptr < slab_address(slab)) + usercopy_abort("SLUB object not in SLUB page?!", NULL, + to_user, 0, n); + + /* Find offset within object. */ + if (is_kfence) + offset = ptr - kfence_object_start(ptr); + else + offset = (ptr - slab_address(slab)) % s->size; + + /* Adjust for redzone and reject if within the redzone. */ + if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) { + if (offset < s->red_left_pad) + usercopy_abort("SLUB object in left red zone", + s->name, to_user, offset, n); + offset -= s->red_left_pad; + } + + /* Allow address range falling entirely within usercopy region. */ + if (offset >= s->useroffset && + offset - s->useroffset <= s->usersize && + n <= s->useroffset - offset + s->usersize) + return; + + usercopy_abort("SLUB object", s->name, to_user, offset, n); +} +#endif /* CONFIG_HARDENED_USERCOPY */ +/* Number of promote buckets in __kmem_cache_do_shrink(): slabs with 1..SHRINK_PROMOTE_MAX free objects are sorted, one bucket per free count. */ +#define SHRINK_PROMOTE_MAX 32 + +/* + * kmem_cache_shrink discards empty slabs and promotes the slabs filled + * up most to the head of the partial lists. New allocations will then + * fill those up and thus they can be removed from the partial lists. + * + * The slabs with the least items are placed last. This results in them + * being allocated from last increasing the chance that the last objects + * are freed in them.
+ */ +static int __kmem_cache_do_shrink(struct kmem_cache *s) +{ + int node; + int i; + struct kmem_cache_node *n; + struct slab *slab; + struct slab *t; + struct list_head discard; + struct list_head promote[SHRINK_PROMOTE_MAX]; + unsigned long flags; + int ret = 0; + + for_each_kmem_cache_node(s, node, n) { + INIT_LIST_HEAD(&discard); + for (i = 0; i < SHRINK_PROMOTE_MAX; i++) + INIT_LIST_HEAD(promote + i); + + spin_lock_irqsave(&n->list_lock, flags); + + /* + * Build lists of slabs to discard or promote. + * + * Note that concurrent frees may occur while we hold the + * list_lock. slab->inuse here is the upper limit. + */ + list_for_each_entry_safe(slab, t, &n->partial, slab_list) { + int free = slab->objects - slab->inuse; + + /* Do not reread slab->inuse */ + barrier(); + + /* We do not keep full slabs on the list */ + BUG_ON(free <= 0); + + if (free == slab->objects) { + list_move(&slab->slab_list, &discard); + n->nr_partial--; + dec_slabs_node(s, node, slab->objects); + } else if (free <= SHRINK_PROMOTE_MAX) + list_move(&slab->slab_list, promote + free - 1); + } + + /* + * Promote the slabs filled up most to the head of the + * partial list.
+ */ + for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) + list_splice(promote + i, &n->partial); + + spin_unlock_irqrestore(&n->list_lock, flags); + + /* Release empty slabs */ + list_for_each_entry_safe(slab, t, &discard, slab_list) + free_slab(s, slab); + + if (slabs_node(s, node)) + ret = 1; + } +/* ret is 1 if slabs_node() was still nonzero for at least one node after shrinking. */ + return ret; +} +/* Flush the cpu slabs with flush_all(), then shrink the per-node partial lists. */ +int __kmem_cache_shrink(struct kmem_cache *s) +{ + flush_all(s); + return __kmem_cache_do_shrink(s); +} +/* MEM_GOING_OFFLINE handler: under slab_mutex, flush and shrink every cache on slab_caches. Always returns 0. */ +static int slab_mem_going_offline_callback(void *arg) +{ + struct kmem_cache *s; + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + flush_all_cpus_locked(s); + __kmem_cache_do_shrink(s); + } + mutex_unlock(&slab_mutex); + + return 0; +} +/* MEM_OFFLINE / MEM_CANCEL_ONLINE handler: clear the node named by status_change_nid_normal from slab_nodes. */ +static void slab_mem_offline_callback(void *arg) +{ + struct memory_notify *marg = arg; + int offline_node; + + offline_node = marg->status_change_nid_normal; + + /* + * If the node still has available memory, we still need its + * kmem_cache_node, so there is nothing to do. + */ + if (offline_node < 0) + return; + + mutex_lock(&slab_mutex); + node_clear(offline_node, slab_nodes); + /* + * We no longer free kmem_cache_node structures here, as it would be + * racy with all get_node() users, and infeasible to protect them with + * slab_mutex. + */ + mutex_unlock(&slab_mutex); +} +/* MEM_GOING_ONLINE handler: give every cache a kmem_cache_node for the new node. Returns 0 or -ENOMEM. */ +static int slab_mem_going_online_callback(void *arg) +{ + struct kmem_cache_node *n; + struct kmem_cache *s; + struct memory_notify *marg = arg; + int nid = marg->status_change_nid_normal; + int ret = 0; + + /* + * If the node's memory is already available, then kmem_cache_node is + * already created. Nothing to do. + */ + if (nid < 0) + return 0; + + /* + * We are bringing a node online. No memory is available yet. We must + * allocate a kmem_cache_node structure in order to bring the node + * online. + */ + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + /* + * The structure may already exist if the node was previously + * onlined and offlined.
+ */ + if (get_node(s, nid)) + continue; + /* + * XXX: kmem_cache_alloc_node will fallback to other nodes + * since memory is not yet available from the node that + * is brought up. + */ + n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto out; + } + init_kmem_cache_node(n); + s->node[nid] = n; + } + /* + * Any cache created after this point will also have kmem_cache_node + * initialized for the new node. + */ + node_set(nid, slab_nodes); +out: + mutex_unlock(&slab_mutex); + return ret; +} +/* Memory hotplug notifier: dispatch @action to the matching handler and convert an errno result with notifier_from_errno(); NOTIFY_OK otherwise. */ +static int slab_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + int ret = 0; + + switch (action) { + case MEM_GOING_ONLINE: + ret = slab_mem_going_online_callback(arg); + break; + case MEM_GOING_OFFLINE: + ret = slab_mem_going_offline_callback(arg); + break; + case MEM_OFFLINE: + case MEM_CANCEL_ONLINE: + slab_mem_offline_callback(arg); + break; + case MEM_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; + return ret; +} +/* Notifier block registered from kmem_cache_init() via register_hotmemory_notifier(). */ +static struct notifier_block slab_memory_callback_nb = { + .notifier_call = slab_memory_callback, + .priority = SLAB_CALLBACK_PRI, +}; + +/******************************************************************** + * Basic setup of slabs + *******************************************************************/ + +/* + * Used for early kmem_cache structures that were allocated using + * the page allocator. Allocate them properly then fix up the pointers + * that may be pointing to the wrong kmem_cache structure. + * + * Returns the new kmem_cache copy, which is also added to slab_caches. + */ + +static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) +{ + int node; + struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + struct kmem_cache_node *n; +/* NOTE(review): the kmem_cache_zalloc() result is used without a NULL check - presumably the allocation cannot fail this early; confirm. */ + memcpy(s, static_cache, kmem_cache->object_size); + + /* + * This runs very early, and only the boot processor is supposed to be + * up. Even if it weren't true, IRQs are not up so we couldn't fire + * IPIs around.
+ */ + __flush_cpu_slab(s, smp_processor_id()); + for_each_kmem_cache_node(s, node, n) { + struct slab *p; +/* Repoint every slab on the node lists at the new kmem_cache copy. */ + list_for_each_entry(p, &n->partial, slab_list) + p->slab_cache = s; + +#ifdef CONFIG_SLUB_DEBUG + list_for_each_entry(p, &n->full, slab_list) + p->slab_cache = s; +#endif + } + list_add(&s->list, &slab_caches); + return s; +} +/* Boot-time SLUB bring-up: create the kmem_cache_node and kmem_cache boot caches from static storage, replace both with bootstrap() copies, then create the kmalloc caches. */ +void __init kmem_cache_init(void) +{ + static __initdata struct kmem_cache boot_kmem_cache, + boot_kmem_cache_node; + int node; + + if (debug_guardpage_minorder()) + slub_max_order = 0; + + /* Print slub debugging pointers without hashing */ + if (__slub_debug_enabled()) + no_hash_pointers_enable(NULL); + + kmem_cache_node = &boot_kmem_cache_node; + kmem_cache = &boot_kmem_cache; + + /* + * Initialize the nodemask for which we will allocate per node + * structures. Here we don't need taking slab_mutex yet. + */ + for_each_node_state(node, N_NORMAL_MEMORY) + node_set(node, slab_nodes); + + create_boot_cache(kmem_cache_node, "kmem_cache_node", + sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); + + register_hotmemory_notifier(&slab_memory_callback_nb); + + /* Able to allocate the per node structures */ + slab_state = PARTIAL; + + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *), + SLAB_HWCACHE_ALIGN, 0, 0); + + kmem_cache = bootstrap(&boot_kmem_cache); + kmem_cache_node = bootstrap(&boot_kmem_cache_node); + + /* Now we can use the kmem_cache to allocate kmalloc slabs */ + setup_kmalloc_cache_index_table(); + create_kmalloc_caches(0); + + /* Setup random freelists for each cache */ + init_freelist_randomization(); + + cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, + slub_cpu_dead); + + pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n", + cache_line_size(), + slub_min_order, slub_max_order, slub_min_objects, + nr_cpu_ids, nr_node_ids); +} + +void __init kmem_cache_init_late(void) +{ + flushwq = 
alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); + WARN_ON(!flushwq); +} + +struct kmem_cache * +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)) +{ + struct kmem_cache *s; + + s = find_mergeable(size, align, flags, name, ctor); + if (s) { + if (sysfs_slab_alias(s, name)) + return NULL; + + s->refcount++; + + /* + * Adjust the object sizes so that we clear + * the complete object on kzalloc. + */ + s->object_size = max(s->object_size, size); + s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); + } + + return s; +} + +int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) +{ + int err; + + err = kmem_cache_open(s, flags); + if (err) + return err; + + /* Mutex is not taken during early boot */ + if (slab_state <= UP) + return 0; + + err = sysfs_slab_add(s); + if (err) { + __kmem_cache_release(s); + return err; + } + + if (s->flags & SLAB_STORE_USER) + debugfs_slab_add(s); + + return 0; +} + +#ifdef CONFIG_SYSFS +static int count_inuse(struct slab *slab) +{ + return slab->inuse; +} + +static int count_total(struct slab *slab) +{ + return slab->objects; +} +#endif + +#ifdef CONFIG_SLUB_DEBUG +static void validate_slab(struct kmem_cache *s, struct slab *slab, + unsigned long *obj_map) +{ + void *p; + void *addr = slab_address(slab); + + if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) + return; + + /* Now we know that a valid freelist exists */ + __fill_map(obj_map, s, slab); + for_each_object(p, s, addr, slab->objects) { + u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ? 
+ SLUB_RED_INACTIVE : SLUB_RED_ACTIVE; + + if (!check_object(s, slab, p, val)) + break; + } +} + +static int validate_slab_node(struct kmem_cache *s, + struct kmem_cache_node *n, unsigned long *obj_map) +{ + unsigned long count = 0; + struct slab *slab; + unsigned long flags; + + spin_lock_irqsave(&n->list_lock, flags); + + list_for_each_entry(slab, &n->partial, slab_list) { + validate_slab(s, slab, obj_map); + count++; + } + if (count != n->nr_partial) { + pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", + s->name, count, n->nr_partial); + slab_add_kunit_errors(); + } + + if (!(s->flags & SLAB_STORE_USER)) + goto out; + + list_for_each_entry(slab, &n->full, slab_list) { + validate_slab(s, slab, obj_map); + count++; + } + if (count != atomic_long_read(&n->nr_slabs)) { + pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", + s->name, count, atomic_long_read(&n->nr_slabs)); + slab_add_kunit_errors(); + } + +out: + spin_unlock_irqrestore(&n->list_lock, flags); + return count; +} + +long validate_slab_cache(struct kmem_cache *s) +{ + int node; + unsigned long count = 0; + struct kmem_cache_node *n; + unsigned long *obj_map; + + obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL); + if (!obj_map) + return -ENOMEM; + + flush_all(s); + for_each_kmem_cache_node(s, node, n) + count += validate_slab_node(s, n, obj_map); + + bitmap_free(obj_map); + + return count; +} +EXPORT_SYMBOL(validate_slab_cache); + +#ifdef CONFIG_DEBUG_FS +/* + * Generate lists of code addresses where slabcache objects are allocated + * and freed. 
+ */ + +struct location { + depot_stack_handle_t handle; + unsigned long count; + unsigned long addr; + unsigned long waste; + long long sum_time; + long min_time; + long max_time; + long min_pid; + long max_pid; + DECLARE_BITMAP(cpus, NR_CPUS); + nodemask_t nodes; +}; + +struct loc_track { + unsigned long max; + unsigned long count; + struct location *loc; + loff_t idx; +}; + +static struct dentry *slab_debugfs_root; + +static void free_loc_track(struct loc_track *t) +{ + if (t->max) + free_pages((unsigned long)t->loc, + get_order(sizeof(struct location) * t->max)); +} + +static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) +{ + struct location *l; + int order; + + order = get_order(sizeof(struct location) * max); + + l = (void *)__get_free_pages(flags, order); + if (!l) + return 0; + + if (t->count) { + memcpy(l, t->loc, sizeof(struct location) * t->count); + free_loc_track(t); + } + t->max = max; + t->loc = l; + return 1; +} + +static int add_location(struct loc_track *t, struct kmem_cache *s, + const struct track *track, + unsigned int orig_size) +{ + long start, end, pos; + struct location *l; + unsigned long caddr, chandle, cwaste; + unsigned long age = jiffies - track->when; + depot_stack_handle_t handle = 0; + unsigned int waste = s->object_size - orig_size; + +#ifdef CONFIG_STACKDEPOT + handle = READ_ONCE(track->handle); +#endif + start = -1; + end = t->count; + + for ( ; ; ) { + pos = start + (end - start + 1) / 2; + + /* + * There is nothing at "end". If we end up there + * we need to add something to before end. 
+ */ + if (pos == end) + break; + + l = &t->loc[pos]; + caddr = l->addr; + chandle = l->handle; + cwaste = l->waste; + if ((track->addr == caddr) && (handle == chandle) && + (waste == cwaste)) { + + l->count++; + if (track->when) { + l->sum_time += age; + if (age < l->min_time) + l->min_time = age; + if (age > l->max_time) + l->max_time = age; + + if (track->pid < l->min_pid) + l->min_pid = track->pid; + if (track->pid > l->max_pid) + l->max_pid = track->pid; + + cpumask_set_cpu(track->cpu, + to_cpumask(l->cpus)); + } + node_set(page_to_nid(virt_to_page(track)), l->nodes); + return 1; + } + + if (track->addr < caddr) + end = pos; + else if (track->addr == caddr && handle < chandle) + end = pos; + else if (track->addr == caddr && handle == chandle && + waste < cwaste) + end = pos; + else + start = pos; + } + + /* + * Not found. Insert new tracking element. + */ + if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC)) + return 0; + + l = t->loc + pos; + if (pos < t->count) + memmove(l + 1, l, + (t->count - pos) * sizeof(struct location)); + t->count++; + l->count = 1; + l->addr = track->addr; + l->sum_time = age; + l->min_time = age; + l->max_time = age; + l->min_pid = track->pid; + l->max_pid = track->pid; + l->handle = handle; + l->waste = waste; + cpumask_clear(to_cpumask(l->cpus)); + cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); + nodes_clear(l->nodes); + node_set(page_to_nid(virt_to_page(track)), l->nodes); + return 1; +} + +static void process_slab(struct loc_track *t, struct kmem_cache *s, + struct slab *slab, enum track_item alloc, + unsigned long *obj_map) +{ + void *addr = slab_address(slab); + bool is_alloc = (alloc == TRACK_ALLOC); + void *p; + + __fill_map(obj_map, s, slab); + + for_each_object(p, s, addr, slab->objects) + if (!test_bit(__obj_to_index(s, addr, p), obj_map)) + add_location(t, s, get_track(s, p, alloc), + is_alloc ? 
get_orig_size(s, p) : + s->object_size); +} +#endif /* CONFIG_DEBUG_FS */ +#endif /* CONFIG_SLUB_DEBUG */ + +#ifdef CONFIG_SYSFS +enum slab_stat_type { + SL_ALL, /* All slabs */ + SL_PARTIAL, /* Only partially allocated slabs */ + SL_CPU, /* Only slabs used for cpu caches */ + SL_OBJECTS, /* Determine allocated objects not slabs */ + SL_TOTAL /* Determine object capacity not slabs */ +}; + +#define SO_ALL (1 << SL_ALL) +#define SO_PARTIAL (1 << SL_PARTIAL) +#define SO_CPU (1 << SL_CPU) +#define SO_OBJECTS (1 << SL_OBJECTS) +#define SO_TOTAL (1 << SL_TOTAL) + +static ssize_t show_slab_objects(struct kmem_cache *s, + char *buf, unsigned long flags) +{ + unsigned long total = 0; + int node; + int x; + unsigned long *nodes; + int len = 0; + + nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); + if (!nodes) + return -ENOMEM; + + if (flags & SO_CPU) { + int cpu; + + for_each_possible_cpu(cpu) { + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, + cpu); + int node; + struct slab *slab; + + slab = READ_ONCE(c->slab); + if (!slab) + continue; + + node = slab_nid(slab); + if (flags & SO_TOTAL) + x = slab->objects; + else if (flags & SO_OBJECTS) + x = slab->inuse; + else + x = 1; + + total += x; + nodes[node] += x; + +#ifdef CONFIG_SLUB_CPU_PARTIAL + slab = slub_percpu_partial_read_once(c); + if (slab) { + node = slab_nid(slab); + if (flags & SO_TOTAL) + WARN_ON_ONCE(1); + else if (flags & SO_OBJECTS) + WARN_ON_ONCE(1); + else + x = slab->slabs; + total += x; + nodes[node] += x; + } +#endif + } + } + + /* + * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex" + * already held which will conflict with an existing lock order: + * + * mem_hotplug_lock->slab_mutex->kernfs_mutex + * + * We don't really need mem_hotplug_lock (to hold off + * slab_mem_going_offline_callback) here because slab's memory hot + * unplug code doesn't destroy the kmem_cache->node[] data. 
+ */ + +#ifdef CONFIG_SLUB_DEBUG + if (flags & SO_ALL) { + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { + + if (flags & SO_TOTAL) + x = atomic_long_read(&n->total_objects); + else if (flags & SO_OBJECTS) + x = atomic_long_read(&n->total_objects) - + count_partial(n, count_free); + else + x = atomic_long_read(&n->nr_slabs); + total += x; + nodes[node] += x; + } + + } else +#endif + if (flags & SO_PARTIAL) { + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { + if (flags & SO_TOTAL) + x = count_partial(n, count_total); + else if (flags & SO_OBJECTS) + x = count_partial(n, count_inuse); + else + x = n->nr_partial; + total += x; + nodes[node] += x; + } + } + + len += sysfs_emit_at(buf, len, "%lu", total); +#ifdef CONFIG_NUMA + for (node = 0; node < nr_node_ids; node++) { + if (nodes[node]) + len += sysfs_emit_at(buf, len, " N%d=%lu", + node, nodes[node]); + } +#endif + len += sysfs_emit_at(buf, len, "\n"); + kfree(nodes); + + return len; +} + +#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) +#define to_slab(n) container_of(n, struct kmem_cache, kobj) + +struct slab_attribute { + struct attribute attr; + ssize_t (*show)(struct kmem_cache *s, char *buf); + ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count); +}; + +#define SLAB_ATTR_RO(_name) \ + static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400) + +#define SLAB_ATTR(_name) \ + static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600) + +static ssize_t slab_size_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%u\n", s->size); +} +SLAB_ATTR_RO(slab_size); + +static ssize_t align_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%u\n", s->align); +} +SLAB_ATTR_RO(align); + +static ssize_t object_size_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%u\n", s->object_size); +} +SLAB_ATTR_RO(object_size); + +static ssize_t objs_per_slab_show(struct 
kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%u\n", oo_objects(s->oo)); +} +SLAB_ATTR_RO(objs_per_slab); + +static ssize_t order_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%u\n", oo_order(s->oo)); +} +SLAB_ATTR_RO(order); + +static ssize_t min_partial_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%lu\n", s->min_partial); +} + +static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long min; + int err; + + err = kstrtoul(buf, 10, &min); + if (err) + return err; + + s->min_partial = min; + return length; +} +SLAB_ATTR(min_partial); + +static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) +{ + unsigned int nr_partial = 0; +#ifdef CONFIG_SLUB_CPU_PARTIAL + nr_partial = s->cpu_partial; +#endif + + return sysfs_emit(buf, "%u\n", nr_partial); +} + +static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned int objects; + int err; + + err = kstrtouint(buf, 10, &objects); + if (err) + return err; + if (objects && !kmem_cache_has_cpu_partial(s)) + return -EINVAL; + + slub_set_cpu_partial(s, objects); + flush_all(s); + return length; +} +SLAB_ATTR(cpu_partial); + +static ssize_t ctor_show(struct kmem_cache *s, char *buf) +{ + if (!s->ctor) + return 0; + return sysfs_emit(buf, "%pS\n", s->ctor); +} +SLAB_ATTR_RO(ctor); + +static ssize_t aliases_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 
0 : s->refcount - 1); +} +SLAB_ATTR_RO(aliases); + +static ssize_t partial_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_PARTIAL); +} +SLAB_ATTR_RO(partial); + +static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_CPU); +} +SLAB_ATTR_RO(cpu_slabs); + +static ssize_t objects_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); +} +SLAB_ATTR_RO(objects); + +static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); +} +SLAB_ATTR_RO(objects_partial); + +static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) +{ + int objects = 0; + int slabs = 0; + int cpu __maybe_unused; + int len = 0; + +#ifdef CONFIG_SLUB_CPU_PARTIAL + for_each_online_cpu(cpu) { + struct slab *slab; + + slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); + + if (slab) + slabs += slab->slabs; + } +#endif + + /* Approximate half-full slabs, see slub_set_cpu_partial() */ + objects = (slabs * oo_objects(s->oo)) / 2; + len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs); + +#if defined(CONFIG_SLUB_CPU_PARTIAL) && defined(CONFIG_SMP) + for_each_online_cpu(cpu) { + struct slab *slab; + + slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); + if (slab) { + slabs = READ_ONCE(slab->slabs); + objects = (slabs * oo_objects(s->oo)) / 2; + len += sysfs_emit_at(buf, len, " C%d=%d(%d)", + cpu, objects, slabs); + } + } +#endif + len += sysfs_emit_at(buf, len, "\n"); + + return len; +} +SLAB_ATTR_RO(slabs_cpu_partial); + +static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); +} +SLAB_ATTR_RO(reclaim_account); + +static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); +} +SLAB_ATTR_RO(hwcache_align); + +#ifdef 
CONFIG_ZONE_DMA +static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); +} +SLAB_ATTR_RO(cache_dma); +#endif + +static ssize_t usersize_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%u\n", s->usersize); +} +SLAB_ATTR_RO(usersize); + +static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); +} +SLAB_ATTR_RO(destroy_by_rcu); + +#ifdef CONFIG_SLUB_DEBUG +static ssize_t slabs_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL); +} +SLAB_ATTR_RO(slabs); + +static ssize_t total_objects_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); +} +SLAB_ATTR_RO(total_objects); + +static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); +} +SLAB_ATTR_RO(sanity_checks); + +static ssize_t trace_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE)); +} +SLAB_ATTR_RO(trace); + +static ssize_t red_zone_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); +} + +SLAB_ATTR_RO(red_zone); + +static ssize_t poison_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON)); +} + +SLAB_ATTR_RO(poison); + +static ssize_t store_user_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); +} + +SLAB_ATTR_RO(store_user); + +static ssize_t validate_show(struct kmem_cache *s, char *buf) +{ + return 0; +} + +static ssize_t validate_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int ret = -EINVAL; + + if (buf[0] == '1' && kmem_cache_debug(s)) { + ret = validate_slab_cache(s); + if (ret >= 0) + ret = length; + } + return ret; +} +SLAB_ATTR(validate); + 
+#endif /* CONFIG_SLUB_DEBUG */ + +#ifdef CONFIG_FAILSLAB +static ssize_t failslab_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); +} +SLAB_ATTR_RO(failslab); +#endif + +static ssize_t shrink_show(struct kmem_cache *s, char *buf) +{ + return 0; +} + +static ssize_t shrink_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (buf[0] == '1') + kmem_cache_shrink(s); + else + return -EINVAL; + return length; +} +SLAB_ATTR(shrink); + +#ifdef CONFIG_NUMA +static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10); +} + +static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + unsigned int ratio; + int err; + + err = kstrtouint(buf, 10, &ratio); + if (err) + return err; + if (ratio > 100) + return -ERANGE; + + s->remote_node_defrag_ratio = ratio * 10; + + return length; +} +SLAB_ATTR(remote_node_defrag_ratio); +#endif + +#ifdef CONFIG_SLUB_STATS +static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) +{ + unsigned long sum = 0; + int cpu; + int len = 0; + int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + for_each_online_cpu(cpu) { + unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; + + data[cpu] = x; + sum += x; + } + + len += sysfs_emit_at(buf, len, "%lu", sum); + +#ifdef CONFIG_SMP + for_each_online_cpu(cpu) { + if (data[cpu]) + len += sysfs_emit_at(buf, len, " C%d=%u", + cpu, data[cpu]); + } +#endif + kfree(data); + len += sysfs_emit_at(buf, len, "\n"); + + return len; +} + +static void clear_stat(struct kmem_cache *s, enum stat_item si) +{ + int cpu; + + for_each_online_cpu(cpu) + per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; +} + +#define STAT_ATTR(si, text) \ +static ssize_t text##_show(struct kmem_cache *s, char *buf) \ +{ \ + return show_stat(s, buf, si); \ +} \ +static ssize_t 
text##_store(struct kmem_cache *s, \ + const char *buf, size_t length) \ +{ \ + if (buf[0] != '0') \ + return -EINVAL; \ + clear_stat(s, si); \ + return length; \ +} \ +SLAB_ATTR(text); \ + +STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); +STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); +STAT_ATTR(FREE_FASTPATH, free_fastpath); +STAT_ATTR(FREE_SLOWPATH, free_slowpath); +STAT_ATTR(FREE_FROZEN, free_frozen); +STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); +STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); +STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); +STAT_ATTR(ALLOC_SLAB, alloc_slab); +STAT_ATTR(ALLOC_REFILL, alloc_refill); +STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); +STAT_ATTR(FREE_SLAB, free_slab); +STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); +STAT_ATTR(DEACTIVATE_FULL, deactivate_full); +STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); +STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); +STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); +STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); +STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); +STAT_ATTR(ORDER_FALLBACK, order_fallback); +STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); +STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); +STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); +STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); +STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); +STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); +#endif /* CONFIG_SLUB_STATS */ + +#ifdef CONFIG_KFENCE +static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE)); +} + +static ssize_t skip_kfence_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int ret = length; + + if (buf[0] == '0') + s->flags &= ~SLAB_SKIP_KFENCE; + else if (buf[0] == '1') + s->flags |= SLAB_SKIP_KFENCE; + else + ret = -EINVAL; + + return ret; +} +SLAB_ATTR(skip_kfence); +#endif + +static struct attribute *slab_attrs[] = { + &slab_size_attr.attr, + 
&object_size_attr.attr, + &objs_per_slab_attr.attr, + &order_attr.attr, + &min_partial_attr.attr, + &cpu_partial_attr.attr, + &objects_attr.attr, + &objects_partial_attr.attr, + &partial_attr.attr, + &cpu_slabs_attr.attr, + &ctor_attr.attr, + &aliases_attr.attr, + &align_attr.attr, + &hwcache_align_attr.attr, + &reclaim_account_attr.attr, + &destroy_by_rcu_attr.attr, + &shrink_attr.attr, + &slabs_cpu_partial_attr.attr, +#ifdef CONFIG_SLUB_DEBUG + &total_objects_attr.attr, + &slabs_attr.attr, + &sanity_checks_attr.attr, + &trace_attr.attr, + &red_zone_attr.attr, + &poison_attr.attr, + &store_user_attr.attr, + &validate_attr.attr, +#endif +#ifdef CONFIG_ZONE_DMA + &cache_dma_attr.attr, +#endif +#ifdef CONFIG_NUMA + &remote_node_defrag_ratio_attr.attr, +#endif +#ifdef CONFIG_SLUB_STATS + &alloc_fastpath_attr.attr, + &alloc_slowpath_attr.attr, + &free_fastpath_attr.attr, + &free_slowpath_attr.attr, + &free_frozen_attr.attr, + &free_add_partial_attr.attr, + &free_remove_partial_attr.attr, + &alloc_from_partial_attr.attr, + &alloc_slab_attr.attr, + &alloc_refill_attr.attr, + &alloc_node_mismatch_attr.attr, + &free_slab_attr.attr, + &cpuslab_flush_attr.attr, + &deactivate_full_attr.attr, + &deactivate_empty_attr.attr, + &deactivate_to_head_attr.attr, + &deactivate_to_tail_attr.attr, + &deactivate_remote_frees_attr.attr, + &deactivate_bypass_attr.attr, + &order_fallback_attr.attr, + &cmpxchg_double_fail_attr.attr, + &cmpxchg_double_cpu_fail_attr.attr, + &cpu_partial_alloc_attr.attr, + &cpu_partial_free_attr.attr, + &cpu_partial_node_attr.attr, + &cpu_partial_drain_attr.attr, +#endif +#ifdef CONFIG_FAILSLAB + &failslab_attr.attr, +#endif + &usersize_attr.attr, +#ifdef CONFIG_KFENCE + &skip_kfence_attr.attr, +#endif + + NULL +}; + +static const struct attribute_group slab_attr_group = { + .attrs = slab_attrs, +}; + +static ssize_t slab_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct slab_attribute *attribute; + struct kmem_cache *s; + + 
attribute = to_slab_attr(attr); + s = to_slab(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(s, buf); +} + +static ssize_t slab_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct slab_attribute *attribute; + struct kmem_cache *s; + + attribute = to_slab_attr(attr); + s = to_slab(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(s, buf, len); +} + +static void kmem_cache_release(struct kobject *k) +{ + slab_kmem_cache_release(to_slab(k)); +} + +static const struct sysfs_ops slab_sysfs_ops = { + .show = slab_attr_show, + .store = slab_attr_store, +}; + +static struct kobj_type slab_ktype = { + .sysfs_ops = &slab_sysfs_ops, + .release = kmem_cache_release, +}; + +static struct kset *slab_kset; + +static inline struct kset *cache_kset(struct kmem_cache *s) +{ + return slab_kset; +} + +#define ID_STR_LENGTH 32 + +/* Create a unique string id for a slab cache: + * + * Format :[flags-]size + */ +static char *create_unique_id(struct kmem_cache *s) +{ + char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); + char *p = name; + + if (!name) + return ERR_PTR(-ENOMEM); + + *p++ = ':'; + /* + * First flags affecting slabcache operations. We will only + * get here for aliasable slabs so we do not need to support + * too many flags. The flags here must cover all flags that + * are matched during merging to guarantee that the id is + * unique. 
+ */ + if (s->flags & SLAB_CACHE_DMA) + *p++ = 'd'; + if (s->flags & SLAB_CACHE_DMA32) + *p++ = 'D'; + if (s->flags & SLAB_RECLAIM_ACCOUNT) + *p++ = 'a'; + if (s->flags & SLAB_CONSISTENCY_CHECKS) + *p++ = 'F'; + if (s->flags & SLAB_ACCOUNT) + *p++ = 'A'; + if (p != name + 1) + *p++ = '-'; + p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size); + + if (WARN_ON(p > name + ID_STR_LENGTH - 1)) { + kfree(name); + return ERR_PTR(-EINVAL); + } + kmsan_unpoison_memory(name, p - name); + return name; +} + +static int sysfs_slab_add(struct kmem_cache *s) +{ + int err; + const char *name; + struct kset *kset = cache_kset(s); + int unmergeable = slab_unmergeable(s); + + if (!kset) { + kobject_init(&s->kobj, &slab_ktype); + return 0; + } + + if (!unmergeable && disable_higher_order_debug && + (slub_debug & DEBUG_METADATA_FLAGS)) + unmergeable = 1; + + if (unmergeable) { + /* + * Slabcache can never be merged so we can use the name proper. + * This is typically the case for debug situations. In that + * case we can catch duplicate names easily. + */ + sysfs_remove_link(&slab_kset->kobj, s->name); + name = s->name; + } else { + /* + * Create a unique name for the slab as a target + * for the symlinks. 
+ */ + name = create_unique_id(s); + if (IS_ERR(name)) + return PTR_ERR(name); + } + + s->kobj.kset = kset; + err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); + if (err) + goto out; + + err = sysfs_create_group(&s->kobj, &slab_attr_group); + if (err) + goto out_del_kobj; + + if (!unmergeable) { + /* Setup first alias */ + sysfs_slab_alias(s, s->name); + } +out: + if (!unmergeable) + kfree(name); + return err; +out_del_kobj: + kobject_del(&s->kobj); + goto out; +} + +void sysfs_slab_unlink(struct kmem_cache *s) +{ + if (slab_state >= FULL) + kobject_del(&s->kobj); +} + +void sysfs_slab_release(struct kmem_cache *s) +{ + if (slab_state >= FULL) + kobject_put(&s->kobj); +} + +/* + * Need to buffer aliases during bootup until sysfs becomes + * available lest we lose that information. + */ +struct saved_alias { + struct kmem_cache *s; + const char *name; + struct saved_alias *next; +}; + +static struct saved_alias *alias_list; + +static int sysfs_slab_alias(struct kmem_cache *s, const char *name) +{ + struct saved_alias *al; + + if (slab_state == FULL) { + /* + * If we have a leftover link then remove it. 
+ */ + sysfs_remove_link(&slab_kset->kobj, name); + return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); + } + + al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); + if (!al) + return -ENOMEM; + + al->s = s; + al->name = name; + al->next = alias_list; + alias_list = al; + kmsan_unpoison_memory(al, sizeof(*al)); + return 0; +} + +static int __init slab_sysfs_init(void) +{ + struct kmem_cache *s; + int err; + + mutex_lock(&slab_mutex); + + slab_kset = kset_create_and_add("slab", NULL, kernel_kobj); + if (!slab_kset) { + mutex_unlock(&slab_mutex); + pr_err("Cannot register slab subsystem.\n"); + return -ENOSYS; + } + + slab_state = FULL; + + list_for_each_entry(s, &slab_caches, list) { + err = sysfs_slab_add(s); + if (err) + pr_err("SLUB: Unable to add boot slab %s to sysfs\n", + s->name); + } + + while (alias_list) { + struct saved_alias *al = alias_list; + + alias_list = alias_list->next; + err = sysfs_slab_alias(al->s, al->name); + if (err) + pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", + al->name); + kfree(al); + } + + mutex_unlock(&slab_mutex); + return 0; +} + +__initcall(slab_sysfs_init); +#endif /* CONFIG_SYSFS */ + +#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS) +static int slab_debugfs_show(struct seq_file *seq, void *v) +{ + struct loc_track *t = seq->private; + struct location *l; + unsigned long idx; + + idx = (unsigned long) t->idx; + if (idx < t->count) { + l = &t->loc[idx]; + + seq_printf(seq, "%7ld ", l->count); + + if (l->addr) + seq_printf(seq, "%pS", (void *)l->addr); + else + seq_puts(seq, ""); + + if (l->waste) + seq_printf(seq, " waste=%lu/%lu", + l->count * l->waste, l->waste); + + if (l->sum_time != l->min_time) { + seq_printf(seq, " age=%ld/%llu/%ld", + l->min_time, div_u64(l->sum_time, l->count), + l->max_time); + } else + seq_printf(seq, " age=%ld", l->min_time); + + if (l->min_pid != l->max_pid) + seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid); + else + seq_printf(seq, " pid=%ld", + 
l->min_pid); + + if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus))) + seq_printf(seq, " cpus=%*pbl", + cpumask_pr_args(to_cpumask(l->cpus))); + + if (nr_online_nodes > 1 && !nodes_empty(l->nodes)) + seq_printf(seq, " nodes=%*pbl", + nodemask_pr_args(&l->nodes)); + +#ifdef CONFIG_STACKDEPOT + { + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries, j; + + handle = READ_ONCE(l->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + seq_puts(seq, "\n"); + for (j = 0; j < nr_entries; j++) + seq_printf(seq, " %pS\n", (void *)entries[j]); + } + } +#endif + seq_puts(seq, "\n"); + } + + if (!idx && !t->count) + seq_puts(seq, "No data\n"); + + return 0; +} + +static void slab_debugfs_stop(struct seq_file *seq, void *v) +{ +} + +static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos) +{ + struct loc_track *t = seq->private; + + t->idx = ++(*ppos); + if (*ppos <= t->count) + return ppos; + + return NULL; +} + +static int cmp_loc_by_count(const void *a, const void *b, const void *data) +{ + struct location *loc1 = (struct location *)a; + struct location *loc2 = (struct location *)b; + + if (loc1->count > loc2->count) + return -1; + else + return 1; +} + +static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos) +{ + struct loc_track *t = seq->private; + + t->idx = *ppos; + return ppos; +} + +static const struct seq_operations slab_debugfs_sops = { + .start = slab_debugfs_start, + .next = slab_debugfs_next, + .stop = slab_debugfs_stop, + .show = slab_debugfs_show, +}; + +static int slab_debug_trace_open(struct inode *inode, struct file *filep) +{ + + struct kmem_cache_node *n; + enum track_item alloc; + int node; + struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops, + sizeof(struct loc_track)); + struct kmem_cache *s = file_inode(filep)->i_private; + unsigned long *obj_map; + + if (!t) + return -ENOMEM; + + obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL); + if 
(!obj_map) { + seq_release_private(inode, filep); + return -ENOMEM; + } + + if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0) + alloc = TRACK_ALLOC; + else + alloc = TRACK_FREE; + + if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { + bitmap_free(obj_map); + seq_release_private(inode, filep); + return -ENOMEM; + } + + for_each_kmem_cache_node(s, node, n) { + unsigned long flags; + struct slab *slab; + + if (!atomic_long_read(&n->nr_slabs)) + continue; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(slab, &n->partial, slab_list) + process_slab(t, s, slab, alloc, obj_map); + list_for_each_entry(slab, &n->full, slab_list) + process_slab(t, s, slab, alloc, obj_map); + spin_unlock_irqrestore(&n->list_lock, flags); + } + + /* Sort locations by count */ + sort_r(t->loc, t->count, sizeof(struct location), + cmp_loc_by_count, NULL, NULL); + + bitmap_free(obj_map); + return 0; +} + +static int slab_debug_trace_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct loc_track *t = seq->private; + + free_loc_track(t); + return seq_release_private(inode, file); +} + +static const struct file_operations slab_debugfs_fops = { + .open = slab_debug_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = slab_debug_trace_release, +}; + +static void debugfs_slab_add(struct kmem_cache *s) +{ + struct dentry *slab_cache_dir; + + if (unlikely(!slab_debugfs_root)) + return; + + slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root); + + debugfs_create_file("alloc_traces", 0400, + slab_cache_dir, s, &slab_debugfs_fops); + + debugfs_create_file("free_traces", 0400, + slab_cache_dir, s, &slab_debugfs_fops); +} + +/* + * Remove the debugfs directory named after the cache that + * debugfs_slab_add() created under slab_debugfs_root. + * + * debugfs_lookup() returns the dentry with a reference held, so feeding + * its result straight into debugfs_remove_recursive() leaked that + * reference on every call. debugfs_lookup_and_remove() performs the + * lookup, the removal and the dput() in one call, and does nothing when + * no matching entry exists. + */ +void debugfs_slab_release(struct kmem_cache *s) +{ + debugfs_lookup_and_remove(s->name, slab_debugfs_root); +} + +static int __init slab_debugfs_init(void) +{ + struct kmem_cache *s; + + slab_debugfs_root = debugfs_create_dir("slab",
NULL); + + list_for_each_entry(s, &slab_caches, list) + if (s->flags & SLAB_STORE_USER) + debugfs_slab_add(s); + + return 0; + +} +__initcall(slab_debugfs_init); +#endif +/* + * The /proc/slabinfo ABI + */ +#ifdef CONFIG_SLUB_DEBUG +void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) +{ + unsigned long nr_slabs = 0; + unsigned long nr_objs = 0; + unsigned long nr_free = 0; + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { + nr_slabs += node_nr_slabs(n); + nr_objs += node_nr_objs(n); + nr_free += count_partial(n, count_free); + } + + sinfo->active_objs = nr_objs - nr_free; + sinfo->num_objs = nr_objs; + sinfo->active_slabs = nr_slabs; + sinfo->num_slabs = nr_slabs; + sinfo->objects_per_slab = oo_objects(s->oo); + sinfo->cache_order = oo_order(s->oo); +} + +void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) +{ +} + +ssize_t slabinfo_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + return -EIO; +} +#endif /* CONFIG_SLUB_DEBUG */ diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c new file mode 100644 index 000000000..46ae54211 --- /dev/null +++ b/mm/sparse-vmemmap.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Virtual Memory Map support + * + * (C) 2007 sgi. Christoph Lameter. + * + * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn, + * virt_to_page, page_address() to be implemented as a base offset + * calculation without memory access. + * + * However, virtual mappings need a page table and TLBs. Many Linux + * architectures already map their physical space using 1-1 mappings + * via TLBs. For those arches the virtual memory map is essentially + * for free if we use the same page size as the 1-1 mappings. In that + * case the overhead consists of a few additional pages that are + * allocated to create a view of memory for vmemmap. 
+ * + * The architecture is expected to provide a vmemmap_populate() function + * to instantiate the mapping. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Allocate a block of memory to be used to back the virtual memory map + * or to back the page tables that are used to create the mapping. + * Uses the main allocators if they are available, else bootmem. + */ + +static void * __ref __earlyonly_bootmem_alloc(int node, + unsigned long size, + unsigned long align, + unsigned long goal) +{ + return memblock_alloc_try_nid_raw(size, align, goal, + MEMBLOCK_ALLOC_ACCESSIBLE, node); +} + +void * __meminit vmemmap_alloc_block(unsigned long size, int node) +{ + /* If the main allocator is up use that, fallback to bootmem. */ + if (slab_is_available()) { + gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; + int order = get_order(size); + static bool warned; + struct page *page; + + page = alloc_pages_node(node, gfp_mask, order); + if (page) + return page_address(page); + + if (!warned) { + warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL, + "vmemmap alloc failure: order:%u", order); + warned = true; + } + return NULL; + } else + return __earlyonly_bootmem_alloc(node, size, size, + __pa(MAX_DMA_ADDRESS)); +} + +static void * __meminit altmap_alloc_block_buf(unsigned long size, + struct vmem_altmap *altmap); + +/* need to make sure size is all the same during early stage */ +void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node, + struct vmem_altmap *altmap) +{ + void *ptr; + + if (altmap) + return altmap_alloc_block_buf(size, altmap); + + ptr = sparse_buffer_alloc(size); + if (!ptr) + ptr = vmemmap_alloc_block(size, node); + return ptr; +} + +static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap) +{ + return altmap->base_pfn + altmap->reserve + altmap->alloc + + altmap->align; +} + +static unsigned long __meminit vmem_altmap_nr_free(struct 
vmem_altmap *altmap) +{ + unsigned long allocated = altmap->alloc + altmap->align; + + if (altmap->free > allocated) + return altmap->free - allocated; + return 0; +} + +static void * __meminit altmap_alloc_block_buf(unsigned long size, + struct vmem_altmap *altmap) +{ + unsigned long pfn, nr_pfns, nr_align; + + if (size & ~PAGE_MASK) { + pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n", + __func__, size); + return NULL; + } + + pfn = vmem_altmap_next_pfn(altmap); + nr_pfns = size >> PAGE_SHIFT; + nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG); + nr_align = ALIGN(pfn, nr_align) - pfn; + if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap)) + return NULL; + + altmap->alloc += nr_pfns; + altmap->align += nr_align; + pfn += nr_align; + + pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n", + __func__, pfn, altmap->alloc, altmap->align, nr_pfns); + return __va(__pfn_to_phys(pfn)); +} + +void __meminit vmemmap_verify(pte_t *pte, int node, + unsigned long start, unsigned long end) +{ + unsigned long pfn = pte_pfn(*pte); + int actual_node = early_pfn_to_nid(pfn); + + if (node_distance(actual_node, node) > LOCAL_DISTANCE) + pr_warn_once("[%lx-%lx] potential offnode page_structs\n", + start, end - 1); +} + +pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + if (pte_none(*pte)) { + pte_t entry; + void *p; + + if (!reuse) { + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); + if (!p) + return NULL; + } else { + /* + * When a PTE/PMD entry is freed from the init_mm + * there's a free_pages() call to this page allocated + * above. Thus this get_page() is paired with the + * put_page_testzero() on the freeing path. + * This can only called by certain ZONE_DEVICE path, + * and through vmemmap_populate_compound_pages() when + * slab is available. 
+ */ + get_page(reuse); + p = page_to_virt(reuse); + } + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + } + return pte; +} + +static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node) +{ + void *p = vmemmap_alloc_block(size, node); + + if (!p) + return NULL; + memset(p, 0, size); + + return p; +} + +pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) +{ + pmd_t *pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); + if (!p) + return NULL; + pmd_populate_kernel(&init_mm, pmd, p); + } + return pmd; +} + +pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) +{ + pud_t *pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); + if (!p) + return NULL; + pud_populate(&init_mm, pud, p); + } + return pud; +} + +p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) +{ + p4d_t *p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); + if (!p) + return NULL; + p4d_populate(&init_mm, p4d, p); + } + return p4d; +} + +pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) +{ + pgd_t *pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); + if (!p) + return NULL; + pgd_populate(&init_mm, pgd, p); + } + return pgd; +} + +static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return NULL; + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return NULL; + pud = vmemmap_pud_populate(p4d, addr, node); + if (!pud) + return NULL; + pmd = vmemmap_pmd_populate(pud, addr, node); + if (!pmd) + return NULL; + 
pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse); + if (!pte) + return NULL; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + return pte; +} +/* + * Populate every PAGE_SIZE step of [@start, @end) through + * vmemmap_populate_address(), forwarding @altmap and @reuse unchanged. + * Returns -ENOMEM on the first address that fails, 0 otherwise. + */ +static int __meminit vmemmap_populate_range(unsigned long start, + unsigned long end, int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + unsigned long addr = start; + pte_t *pte; + + for (; addr < end; addr += PAGE_SIZE) { + pte = vmemmap_populate_address(addr, node, altmap, reuse); + if (!pte) + return -ENOMEM; + } + + return 0; +} +/* Populate [@start, @end) with no page reuse (@reuse is passed as NULL). */ +int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap) +{ + return vmemmap_populate_range(start, end, node, altmap, NULL); +} + +/* + * For compound pages bigger than section size (e.g. x86 1G compound + * pages with 2M subsection size) fill the rest of sections as tail + * pages. + * + * Note that memremap_pages() resets @nr_range value and will increment + * it after each range is successfully onlined. Thus the value of @nr_range + * at section memmap populate corresponds to the in-progress range + * being onlined here. + * + * Returns true when the offset of @start_pfn from the start of that range + * is not aligned to pgmap_vmemmap_nr(@pgmap) and that count exceeds + * PAGES_PER_SUBSECTION. + */ +static bool __meminit reuse_compound_section(unsigned long start_pfn, + struct dev_pagemap *pgmap) +{ + unsigned long nr_pages = pgmap_vmemmap_nr(pgmap); + unsigned long offset = start_pfn - + PHYS_PFN(pgmap->ranges[pgmap->nr_range].start); + + return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION; +} + +static pte_t * __meminit compound_section_tail_page(unsigned long addr) +{ + pte_t *pte; + + addr -= PAGE_SIZE; + + /* + * Assuming sections are populated sequentially, the previous section's + * page data can be reused.
+ */ + pte = pte_offset_kernel(pmd_off_k(addr), addr); + if (!pte) + return NULL; + + return pte; +} + +static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, + unsigned long start, + unsigned long end, int node, + struct dev_pagemap *pgmap) +{ + unsigned long size, addr; + pte_t *pte; + int rc; + + if (reuse_compound_section(start_pfn, pgmap)) { + pte = compound_section_tail_page(start); + if (!pte) + return -ENOMEM; + + /* + * Reuse the page that was populated in the prior iteration + * with just tail struct pages. + */ + return vmemmap_populate_range(start, end, node, NULL, + pte_page(*pte)); + } + + size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page)); + for (addr = start; addr < end; addr += size) { + unsigned long next, last = addr + size; + + /* Populate the head page vmemmap page */ + pte = vmemmap_populate_address(addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + + /* Populate the tail pages vmemmap page */ + next = addr + PAGE_SIZE; + pte = vmemmap_populate_address(next, node, NULL, NULL); + if (!pte) + return -ENOMEM; + + /* + * Reuse the previous page for the rest of tail pages + * See layout diagram in Documentation/mm/vmemmap_dedup.rst + */ + next += PAGE_SIZE; + rc = vmemmap_populate_range(next, last, node, NULL, + pte_page(*pte)); + if (rc) + return -ENOMEM; + } + + return 0; +} + +struct page * __meminit __populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + unsigned long start = (unsigned long) pfn_to_page(pfn); + unsigned long end = start + nr_pages * sizeof(struct page); + int r; + + if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) || + !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION))) + return NULL; + + if (is_power_of_2(sizeof(struct page)) && + pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap) + r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap); + else + r = vmemmap_populate(start, end, nid, 
altmap); + + if (r < 0) + return NULL; + + return pfn_to_page(pfn); +} diff --git a/mm/sparse.c b/mm/sparse.c new file mode 100644 index 000000000..05d1e7b6c --- /dev/null +++ b/mm/sparse.c @@ -0,0 +1,934 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * sparse memory mappings. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include + +/* + * Permanent SPARSEMEM data: + * + * 1) mem_section - memory sections, mem_map's for valid memory + */ +#ifdef CONFIG_SPARSEMEM_EXTREME +struct mem_section **mem_section; +#else +struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] + ____cacheline_internodealigned_in_smp; +#endif +EXPORT_SYMBOL(mem_section); + +#ifdef NODE_NOT_IN_PAGE_FLAGS +/* + * If we did not store the node number in the page then we have to + * do a lookup in the section_to_node_table in order to find which + * node the page belongs to. + */ +#if MAX_NUMNODES <= 256 +static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; +#else +static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; +#endif + +int page_to_nid(const struct page *page) +{ + return section_to_node_table[page_to_section(page)]; +} +EXPORT_SYMBOL(page_to_nid); + +static void set_section_nid(unsigned long section_nr, int nid) +{ + section_to_node_table[section_nr] = nid; +} +#else /* !NODE_NOT_IN_PAGE_FLAGS */ +static inline void set_section_nid(unsigned long section_nr, int nid) +{ +} +#endif + +#ifdef CONFIG_SPARSEMEM_EXTREME +static noinline struct mem_section __ref *sparse_index_alloc(int nid) +{ + struct mem_section *section = NULL; + unsigned long array_size = SECTIONS_PER_ROOT * + sizeof(struct mem_section); + + if (slab_is_available()) { + section = kzalloc_node(array_size, GFP_KERNEL, nid); + } else { + section = memblock_alloc_node(array_size, SMP_CACHE_BYTES, + nid); + if (!section) + panic("%s: Failed to allocate %lu bytes nid=%d\n", + 
__func__, array_size, nid); + } + + return section; +} + +static int __meminit sparse_index_init(unsigned long section_nr, int nid) +{ + unsigned long root = SECTION_NR_TO_ROOT(section_nr); + struct mem_section *section; + + /* + * An existing section is possible in the sub-section hotplug + * case. First hot-add instantiates, follow-on hot-add reuses + * the existing section. + * + * The mem_hotplug_lock resolves the apparent race below. + */ + if (mem_section[root]) + return 0; + + section = sparse_index_alloc(nid); + if (!section) + return -ENOMEM; + + mem_section[root] = section; + + return 0; +} +#else /* !SPARSEMEM_EXTREME */ +static inline int sparse_index_init(unsigned long section_nr, int nid) +{ + return 0; +} +#endif + +/* + * During early boot, before section_mem_map is used for an actual + * mem_map, we use section_mem_map to store the section's NUMA + * node. This keeps us from having to use another data structure. The + * node information is cleared just before we store the real mem_map. 
+ */ +static inline unsigned long sparse_encode_early_nid(int nid) +{ + return ((unsigned long)nid << SECTION_NID_SHIFT); +} +/* Inverse of sparse_encode_early_nid(): shift the node id back out of section_mem_map. */ +static inline int sparse_early_nid(struct mem_section *section) +{ + return (section->section_mem_map >> SECTION_NID_SHIFT); +} + +/* + * Validate the physical addressing limitations of the model. + * + * @start_pfn and @end_pfn are updated in place. The limit is + * 1UL << (MAX_PHYSMEM_BITS - PAGE_SHIFT): a range that starts above it is + * collapsed to [limit, limit], and a range that only ends above it has its + * end clamped to the limit. Either case emits an MMINIT_WARNING message + * and triggers WARN_ON_ONCE(). + */ +static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, + unsigned long *end_pfn) +{ + unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); + + /* + * Sanity checks - do not allow an architecture to pass + * in larger pfns than the maximum scope of sparsemem: + */ + if (*start_pfn > max_sparsemem_pfn) { + mminit_dprintk(MMINIT_WARNING, "pfnvalidation", + "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n", + *start_pfn, *end_pfn, max_sparsemem_pfn); + WARN_ON_ONCE(1); + *start_pfn = max_sparsemem_pfn; + *end_pfn = max_sparsemem_pfn; + } else if (*end_pfn > max_sparsemem_pfn) { + mminit_dprintk(MMINIT_WARNING, "pfnvalidation", + "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n", + *start_pfn, *end_pfn, max_sparsemem_pfn); + WARN_ON_ONCE(1); + *end_pfn = max_sparsemem_pfn; + } +} + +/* + * There are a number of times that we loop over NR_MEM_SECTIONS, + * looking for section_present() on each. But, when we have very + * large physical address spaces, NR_MEM_SECTIONS can also be + * very large which makes the loops quite long. + * + * Keeping track of this gives us an easy way to break out of + * those loops early.
+ */ +unsigned long __highest_present_section_nr; +static void __section_mark_present(struct mem_section *ms, + unsigned long section_nr) +{ + if (section_nr > __highest_present_section_nr) + __highest_present_section_nr = section_nr; + /* Flag the section as present in its section_mem_map word. */ + ms->section_mem_map |= SECTION_MARKED_PRESENT; +} +/* + * Iterate @section_nr over the values returned by + * next_present_section_nr(), starting the search from @start - 1. The + * loop ends when that helper returns -1 or when @section_nr exceeds + * __highest_present_section_nr. + */ +#define for_each_present_section_nr(start, section_nr) \ + for (section_nr = next_present_section_nr(start-1); \ + ((section_nr != -1) && \ + (section_nr <= __highest_present_section_nr)); \ + section_nr = next_present_section_nr(section_nr)) +/* First present section number: next_present_section_nr() searched from -1. */ +static inline unsigned long first_present_section_nr(void) +{ + return next_present_section_nr(-1); +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static void subsection_mask_set(unsigned long *map, unsigned long pfn, + unsigned long nr_pages) +{ + int idx = subsection_map_index(pfn); + int end = subsection_map_index(pfn + nr_pages - 1); + /* Set bits idx..end inclusive, i.e. the subsections touched by the range. */ + bitmap_set(map, idx, end - idx + 1); +} +/* + * For each section number from pfn_to_section_nr(@pfn) up to + * pfn_to_section_nr(@pfn + @nr_pages - 1), set the bits of that section's + * ms->usage->subsection_map covered by the part of the range falling in + * it. Does nothing when @nr_pages is 0. + */ +void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) +{ + int end_sec = pfn_to_section_nr(pfn + nr_pages - 1); + unsigned long nr, start_sec = pfn_to_section_nr(pfn); + + if (!nr_pages) + return; + + for (nr = start_sec; nr <= end_sec; nr++) { + struct mem_section *ms; + unsigned long pfns; + /* Pages handled in this section: what is left of it, capped by what remains of the range. */ + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + ms = __nr_to_section(nr); + subsection_mask_set(ms->usage->subsection_map, pfn, pfns); + + pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr, + pfns, subsection_map_index(pfn), + subsection_map_index(pfn + pfns - 1)); + + pfn += pfns; + nr_pages -= pfns; + } +} +#else +void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) +{ +} +#endif + +/* Record a memory area against a node.
+ */ +static void __init memory_present(int nid, unsigned long start, unsigned long end) +{ + unsigned long pfn; + +#ifdef CONFIG_SPARSEMEM_EXTREME + if (unlikely(!mem_section)) { + unsigned long size, align; + /* Root table not allocated yet: one pointer per section root; failure is fatal. */ + size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; + align = 1 << (INTERNODE_CACHE_SHIFT); + mem_section = memblock_alloc(size, align); + if (!mem_section) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, size, align); + } +#endif + /* Mask @start with PAGE_SECTION_MASK, then clamp the range to the sparsemem limits. */ + start &= PAGE_SECTION_MASK; + mminit_validate_memmodel_limits(&start, &end); + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { + unsigned long section = pfn_to_section_nr(pfn); + struct mem_section *ms; + + sparse_index_init(section, nid); + set_section_nid(section, nid); + /* Only a section whose section_mem_map is still 0 gets the early nid encoding, SECTION_IS_ONLINE and the present mark. */ + ms = __nr_to_section(section); + if (!ms->section_mem_map) { + ms->section_mem_map = sparse_encode_early_nid(nid) | + SECTION_IS_ONLINE; + __section_mark_present(ms, section); + } + } +} + +/* + * Mark all memblocks as present using memory_present(). + * This is a convenience function that is useful to mark all of the system's + * memory as present during initialization. + */ +static void __init memblocks_present(void) +{ + unsigned long start, end; + int i, nid; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) + memory_present(nid, start, end); +} + +/* + * Subtle, we encode the real pfn into the mem_map such that + * the identity pfn - section_mem_map will return the actual + * physical page frame number.
+ */ +static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum) +{ + unsigned long coded_mem_map = + (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); + BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT); + BUG_ON(coded_mem_map & ~SECTION_MAP_MASK); + return coded_mem_map; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * Decode mem_map from the coded memmap + */ +struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) +{ + /* mask off the extra low bits of information */ + coded_mem_map &= SECTION_MAP_MASK; + return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static void __meminit sparse_init_one_section(struct mem_section *ms, + unsigned long pnum, struct page *mem_map, + struct mem_section_usage *usage, unsigned long flags) +{ + ms->section_mem_map &= ~SECTION_MAP_MASK; + ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) + | SECTION_HAS_MEM_MAP | flags; + ms->usage = usage; +} + +static unsigned long usemap_size(void) +{ + return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long); +} + +size_t mem_section_usage_size(void) +{ + return sizeof(struct mem_section_usage) + usemap_size(); +} + +static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) +{ +#ifndef CONFIG_NUMA + VM_BUG_ON(pgdat != &contig_page_data); + return __pa_symbol(&contig_page_data); +#else + return __pa(pgdat); +#endif +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +static struct mem_section_usage * __init +sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + unsigned long size) +{ + struct mem_section_usage *usage; + unsigned long goal, limit; + int nid; + /* + * A page may contain usemaps for other sections preventing the + * page being freed and making a section unremovable while + * other sections referencing the usemap remain active. Similarly, + * a pgdat can prevent a section being removed. 
If section A + * contains a pgdat and section B contains the usemap, both + * sections become inter-dependent. This allocates usemaps + * from the same section as the pgdat where possible to avoid + * this problem. + */ + goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); + limit = goal + (1UL << PA_SECTION_SHIFT); + nid = early_pfn_to_nid(goal >> PAGE_SHIFT); +again: + usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid); + if (!usage && limit) { + limit = 0; + goto again; + } + return usage; +} + +static void __init check_usemap_section_nr(int nid, + struct mem_section_usage *usage) +{ + unsigned long usemap_snr, pgdat_snr; + static unsigned long old_usemap_snr; + static unsigned long old_pgdat_snr; + struct pglist_data *pgdat = NODE_DATA(nid); + int usemap_nid; + + /* First call */ + if (!old_usemap_snr) { + old_usemap_snr = NR_MEM_SECTIONS; + old_pgdat_snr = NR_MEM_SECTIONS; + } + + usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT); + pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT); + if (usemap_snr == pgdat_snr) + return; + + if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr) + /* skip redundant message */ + return; + + old_usemap_snr = usemap_snr; + old_pgdat_snr = pgdat_snr; + + usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr)); + if (usemap_nid != nid) { + pr_info("node %d must be removed before remove section %ld\n", + nid, usemap_snr); + return; + } + /* + * There is a circular dependency. + * Some platforms allow un-removable section because they will just + * gather other removable sections for dynamic partitioning. + * Just notify un-removable section's number here. 
+ */ + pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n", + usemap_snr, pgdat_snr, nid); +} +#else +static struct mem_section_usage * __init +sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + unsigned long size) +{ + return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id); +} + +static void __init check_usemap_section_nr(int nid, + struct mem_section_usage *usage) +{ +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static unsigned long __init section_map_size(void) +{ + return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE); +} + +#else +static unsigned long __init section_map_size(void) +{ + return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); +} + +struct page __init *__populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + unsigned long size = section_map_size(); + struct page *map = sparse_buffer_alloc(size); + phys_addr_t addr = __pa(MAX_DMA_ADDRESS); + + if (map) + return map; + + map = memmap_alloc(size, size, addr, nid, false); + if (!map) + panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", + __func__, size, PAGE_SIZE, nid, &addr); + + return map; +} +#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ + +static void *sparsemap_buf __meminitdata; +static void *sparsemap_buf_end __meminitdata; + +static inline void __meminit sparse_buffer_free(unsigned long size) +{ + WARN_ON(!sparsemap_buf || size == 0); + memblock_free(sparsemap_buf, size); +} + +static void __init sparse_buffer_init(unsigned long size, int nid) +{ + phys_addr_t addr = __pa(MAX_DMA_ADDRESS); + WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? 
*/ + /* + * Pre-allocated buffer is mainly used by __populate_section_memmap + * and we want it to be properly aligned to the section size - this is + * especially the case for VMEMMAP which maps memmap to PMDs + */ + sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true); + sparsemap_buf_end = sparsemap_buf + size; +} + +static void __init sparse_buffer_fini(void) +{ + unsigned long size = sparsemap_buf_end - sparsemap_buf; + + if (sparsemap_buf && size > 0) + sparse_buffer_free(size); + sparsemap_buf = NULL; +} + +void * __meminit sparse_buffer_alloc(unsigned long size) +{ + void *ptr = NULL; + + if (sparsemap_buf) { + ptr = (void *) roundup((unsigned long)sparsemap_buf, size); + if (ptr + size > sparsemap_buf_end) + ptr = NULL; + else { + /* Free redundant aligned space */ + if ((unsigned long)(ptr - sparsemap_buf) > 0) + sparse_buffer_free((unsigned long)(ptr - sparsemap_buf)); + sparsemap_buf = ptr + size; + } + } + return ptr; +} + +void __weak __meminit vmemmap_populate_print_last(void) +{ +} + +/* + * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end) + * And number of present sections in this node is map_count. + */ +static void __init sparse_init_nid(int nid, unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count) +{ + struct mem_section_usage *usage; + unsigned long pnum; + struct page *map; + + usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid), + mem_section_usage_size() * map_count); + if (!usage) { + pr_err("%s: node[%d] usemap allocation failed", __func__, nid); + goto failed; + } + sparse_buffer_init(map_count * section_map_size(), nid); + for_each_present_section_nr(pnum_begin, pnum) { + unsigned long pfn = section_nr_to_pfn(pnum); + + if (pnum >= pnum_end) + break; + + map = __populate_section_memmap(pfn, PAGES_PER_SECTION, + nid, NULL, NULL); + if (!map) { + pr_err("%s: node[%d] memory map backing failed. 
Some memory will not be available.", + __func__, nid); + pnum_begin = pnum; + sparse_buffer_fini(); + goto failed; + } + check_usemap_section_nr(nid, usage); + sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage, + SECTION_IS_EARLY); + usage = (void *) usage + mem_section_usage_size(); + } + sparse_buffer_fini(); + return; +failed: + /* We failed to allocate, mark all the following pnums as not present */ + for_each_present_section_nr(pnum_begin, pnum) { + struct mem_section *ms; + + if (pnum >= pnum_end) + break; + ms = __nr_to_section(pnum); + ms->section_mem_map = 0; + } +} + +/* + * Allocate the accumulated non-linear sections, allocate a mem_map + * for each and record the physical to section mapping. + */ +void __init sparse_init(void) +{ + unsigned long pnum_end, pnum_begin, map_count = 1; + int nid_begin; + + memblocks_present(); + + pnum_begin = first_present_section_nr(); + nid_begin = sparse_early_nid(__nr_to_section(pnum_begin)); + + /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ + set_pageblock_order(); + + for_each_present_section_nr(pnum_begin + 1, pnum_end) { + int nid = sparse_early_nid(__nr_to_section(pnum_end)); + + if (nid == nid_begin) { + map_count++; + continue; + } + /* Init node with sections in range [pnum_begin, pnum_end) */ + sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count); + nid_begin = nid; + pnum_begin = pnum_end; + map_count = 1; + } + /* cover the last node */ + sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count); + vmemmap_populate_print_last(); +} + +#ifdef CONFIG_MEMORY_HOTPLUG + +/* Mark all memory sections within the pfn range as online */ +void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + struct mem_section *ms; + + /* onlining code should never touch invalid ranges */ + if (WARN_ON(!valid_section_nr(section_nr))) + 
continue; + + ms = __nr_to_section(section_nr); + ms->section_mem_map |= SECTION_IS_ONLINE; + } +} + +/* Mark all memory sections within the pfn range as offline */ +void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(pfn); + struct mem_section *ms; + + /* + * TODO this needs some double checking. Offlining code makes + * sure to check pfn_valid but those checks might be just bogus + */ + if (WARN_ON(!valid_section_nr(section_nr))) + continue; + + ms = __nr_to_section(section_nr); + ms->section_mem_map &= ~SECTION_IS_ONLINE; + } +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static struct page * __meminit populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); +} + +static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + unsigned long start = (unsigned long) pfn_to_page(pfn); + unsigned long end = start + nr_pages * sizeof(struct page); + + vmemmap_free(start, end, altmap); +} +static void free_map_bootmem(struct page *memmap) +{ + unsigned long start = (unsigned long)memmap; + unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + + vmemmap_free(start, end, NULL); +} + +static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; + struct mem_section *ms = __pfn_to_section(pfn); + unsigned long *subsection_map = ms->usage + ? 
&ms->usage->subsection_map[0] : NULL; + + subsection_mask_set(map, pfn, nr_pages); + if (subsection_map) + bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); + + if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), + "section already deactivated (%#lx + %ld)\n", + pfn, nr_pages)) + return -EINVAL; + + bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); + return 0; +} + +static bool is_subsection_map_empty(struct mem_section *ms) +{ + return bitmap_empty(&ms->usage->subsection_map[0], + SUBSECTIONS_PER_SECTION); +} + +static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + struct mem_section *ms = __pfn_to_section(pfn); + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + unsigned long *subsection_map; + int rc = 0; + + subsection_mask_set(map, pfn, nr_pages); + + subsection_map = &ms->usage->subsection_map[0]; + + if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) + rc = -EINVAL; + else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) + rc = -EEXIST; + else + bitmap_or(subsection_map, map, subsection_map, + SUBSECTIONS_PER_SECTION); + + return rc; +} +#else +struct page * __meminit populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + return kvmalloc_node(array_size(sizeof(struct page), + PAGES_PER_SECTION), GFP_KERNEL, nid); +} + +static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + kvfree(pfn_to_page(pfn)); +} + +static void free_map_bootmem(struct page *memmap) +{ + unsigned long maps_section_nr, removing_section_nr, i; + unsigned long magic, nr_pages; + struct page *page = virt_to_page(memmap); + + nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) + >> PAGE_SHIFT; + + for (i = 0; i < nr_pages; i++, page++) { + magic = page->index; + + BUG_ON(magic == NODE_INFO); + + maps_section_nr = 
pfn_to_section_nr(page_to_pfn(page)); + removing_section_nr = page_private(page); + + /* + * When this function is called, the removing section is + * logical offlined state. This means all pages are isolated + * from page allocator. If removing section's memmap is placed + * on the same section, it must not be freed. + * If it is freed, page allocator may allocate it which will + * be removed physically soon. + */ + if (maps_section_nr != removing_section_nr) + put_page_bootmem(page); + } +} + +static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + return 0; +} + +static bool is_subsection_map_empty(struct mem_section *ms) +{ + return true; +} + +static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + return 0; +} +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +/* + * To deactivate a memory region, there are 3 cases to handle across + * two configurations (SPARSEMEM_VMEMMAP={y,n}): + * + * 1. deactivation of a partial hot-added section (only possible in + * the SPARSEMEM_VMEMMAP=y case). + * a) section was present at memory init. + * b) section was hot-added post memory init. + * 2. deactivation of a complete hot-added section. + * 3. deactivation of a complete section from memory init. + * + * For 1, when subsection_map does not empty we will not be freeing the + * usage map, but still need to free the vmemmap range. + * + * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified + */ +static void section_deactivate(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + struct mem_section *ms = __pfn_to_section(pfn); + bool section_is_early = early_section(ms); + struct page *memmap = NULL; + bool empty; + + if (clear_subsection_map(pfn, nr_pages)) + return; + + empty = is_subsection_map_empty(ms); + if (empty) { + unsigned long section_nr = pfn_to_section_nr(pfn); + + /* + * Mark the section invalid so that valid_section() + * return false. 
This prevents code from dereferencing + * ms->usage array. + */ + ms->section_mem_map &= ~SECTION_HAS_MEM_MAP; + + /* + * When removing an early section, the usage map is kept (as the + * usage maps of other sections fall into the same page). It + * will be re-used when re-adding the section - which is then no + * longer an early section. If the usage map is PageReserved, it + * was allocated during boot. + */ + if (!PageReserved(virt_to_page(ms->usage))) { + kfree_rcu(ms->usage, rcu); + WRITE_ONCE(ms->usage, NULL); + } + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + } + + /* + * The memmap of early sections is always fully populated. See + * section_activate() and pfn_valid() . + */ + if (!section_is_early) + depopulate_section_memmap(pfn, nr_pages, altmap); + else if (memmap) + free_map_bootmem(memmap); + + if (empty) + ms->section_mem_map = (unsigned long)NULL; +} + +static struct page * __meminit section_activate(int nid, unsigned long pfn, + unsigned long nr_pages, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + struct mem_section *ms = __pfn_to_section(pfn); + struct mem_section_usage *usage = NULL; + struct page *memmap; + int rc = 0; + + if (!ms->usage) { + usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); + if (!usage) + return ERR_PTR(-ENOMEM); + ms->usage = usage; + } + + rc = fill_subsection_map(pfn, nr_pages); + if (rc) { + if (usage) + ms->usage = NULL; + kfree(usage); + return ERR_PTR(rc); + } + + /* + * The early init code does not consider partially populated + * initial sections, it simply assumes that memory will never be + * referenced. If we hot-add memory into such a section then we + * do not need to populate the memmap and can simply reuse what + * is already there. 
+ */ + if (nr_pages < PAGES_PER_SECTION && early_section(ms)) + return pfn_to_page(pfn); + + memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap); + if (!memmap) { + section_deactivate(pfn, nr_pages, altmap); + return ERR_PTR(-ENOMEM); + } + + return memmap; +} + +/** + * sparse_add_section - add a memory section, or populate an existing one + * @nid: The node to add section on + * @start_pfn: start pfn of the memory range + * @nr_pages: number of pfns to add in the section + * @altmap: alternate pfns to allocate the memmap backing store + * @pgmap: alternate compound page geometry for devmap mappings + * + * This is only intended for hotplug. + * + * Note that only VMEMMAP supports sub-section aligned hotplug, + * the proper alignment and size are gated by check_pfn_span(). + * + * + * Return: + * * 0 - On success. + * * -EEXIST - Section has been present. + * * -ENOMEM - Out of memory. + */ +int __meminit sparse_add_section(int nid, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) +{ + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct mem_section *ms; + struct page *memmap; + int ret; + + ret = sparse_index_init(section_nr, nid); + if (ret < 0) + return ret; + + memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap); + if (IS_ERR(memmap)) + return PTR_ERR(memmap); + + /* + * Poison uninitialized struct pages in order to catch invalid flags + * combinations. 
+ */ + page_init_poison(memmap, sizeof(struct page) * nr_pages); + + ms = __nr_to_section(section_nr); + set_section_nid(section_nr, nid); + __section_mark_present(ms, section_nr); + + /* Align memmap to section boundary in the subsection case */ + if (section_nr_to_pfn(section_nr) != start_pfn) + memmap = pfn_to_page(section_nr_to_pfn(section_nr)); + sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0); + + return 0; +} + +void sparse_remove_section(struct mem_section *ms, unsigned long pfn, + unsigned long nr_pages, unsigned long map_offset, + struct vmem_altmap *altmap) +{ + clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset, + nr_pages - map_offset); + section_deactivate(pfn, nr_pages, altmap); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/swap.c b/mm/swap.c new file mode 100644 index 000000000..955930f41 --- /dev/null +++ b/mm/swap.c @@ -0,0 +1,1127 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/mm/swap.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * This file contains the default values for the operation of the + * Linux VM subsystem. Fine-tuning documentation can be found in + * Documentation/admin-guide/sysctl/vm.rst. + * Started 18.12.91 + * Swap aging added 23.2.95, Stephen Tweedie. + * Buffermem limits added 12.3.98, Rik van Riel. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define CREATE_TRACE_POINTS +#include + +/* How many pages do we try to swap or page in/out together? 
*/ +int page_cluster; + +/* Protecting only lru_rotate.fbatch which requires disabling interrupts */ +struct lru_rotate { + local_lock_t lock; + struct folio_batch fbatch; +}; +static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { + .lock = INIT_LOCAL_LOCK(lock), +}; + +/* + * The following folio batches are grouped together because they are protected + * by disabling preemption (and interrupts remain enabled). + */ +struct cpu_fbatches { + local_lock_t lock; + struct folio_batch lru_add; + struct folio_batch lru_deactivate_file; + struct folio_batch lru_deactivate; + struct folio_batch lru_lazyfree; +#ifdef CONFIG_SMP + struct folio_batch activate; +#endif +}; +static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { + .lock = INIT_LOCAL_LOCK(lock), +}; + +/* + * This path almost never happens for VM activity - pages are normally freed + * via pagevecs. But it gets used by networking - and for compound pages. + */ +static void __page_cache_release(struct folio *folio) +{ + if (folio_test_lru(folio)) { + struct lruvec *lruvec; + unsigned long flags; + + lruvec = folio_lruvec_lock_irqsave(folio, &flags); + lruvec_del_folio(lruvec, folio); + __folio_clear_lru_flags(folio); + unlock_page_lruvec_irqrestore(lruvec, flags); + } + /* See comment on folio_test_mlocked in release_pages() */ + if (unlikely(folio_test_mlocked(folio))) { + long nr_pages = folio_nr_pages(folio); + + __folio_clear_mlocked(folio); + zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); + count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); + } +} + +static void __folio_put_small(struct folio *folio) +{ + __page_cache_release(folio); + mem_cgroup_uncharge(folio); + free_unref_page(&folio->page, 0); +} + +static void __folio_put_large(struct folio *folio) +{ + /* + * __page_cache_release() is supposed to be called for thp, not for + * hugetlb. 
This is because hugetlb page does never have PageLRU set + * (it's never listed to any LRU lists) and no memcg routines should + * be called for hugetlb (it has a separate hugetlb_cgroup.) + */ + if (!folio_test_hugetlb(folio)) + __page_cache_release(folio); + destroy_large_folio(folio); +} + +void __folio_put(struct folio *folio) +{ + if (unlikely(folio_is_zone_device(folio))) + free_zone_device_page(&folio->page); + else if (unlikely(folio_test_large(folio))) + __folio_put_large(folio); + else + __folio_put_small(folio); +} +EXPORT_SYMBOL(__folio_put); + +/** + * put_pages_list() - release a list of pages + * @pages: list of pages threaded on page->lru + * + * Release a list of pages which are strung together on page.lru. + */ +void put_pages_list(struct list_head *pages) +{ + struct folio *folio, *next; + + list_for_each_entry_safe(folio, next, pages, lru) { + if (!folio_put_testzero(folio)) { + list_del(&folio->lru); + continue; + } + if (folio_test_large(folio)) { + list_del(&folio->lru); + __folio_put_large(folio); + continue; + } + /* LRU flag must be clear because it's passed using the lru */ + } + + free_unref_page_list(pages); + INIT_LIST_HEAD(pages); +} +EXPORT_SYMBOL(put_pages_list); + +/* + * get_kernel_pages() - pin kernel pages in memory + * @kiov: An array of struct kvec structures + * @nr_segs: number of segments to pin + * @write: pinning for read/write, currently ignored + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_segs long. + * + * Returns number of pages pinned. This may be fewer than the number requested. + * If nr_segs is 0 or negative, returns 0. If no pages were pinned, returns 0. + * Each page returned must be released with a put_page() call when it is + * finished with. 
+ */ +int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, + struct page **pages) +{ + int seg; + + for (seg = 0; seg < nr_segs; seg++) { + if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) + return seg; + + pages[seg] = kmap_to_page(kiov[seg].iov_base); + get_page(pages[seg]); + } + + return seg; +} +EXPORT_SYMBOL_GPL(get_kernel_pages); + +typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); + +static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) +{ + int was_unevictable = folio_test_clear_unevictable(folio); + long nr_pages = folio_nr_pages(folio); + + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + + /* + * Is an smp_mb__after_atomic() still required here, before + * folio_evictable() tests the mlocked flag, to rule out the possibility + * of stranding an evictable folio on an unevictable LRU? I think + * not, because __munlock_page() only clears the mlocked flag + * while the LRU lock is held. + * + * (That is not true of __page_cache_release(), and not necessarily + * true of release_pages(): but those only clear the mlocked flag after + * folio_put_testzero() has excluded any other users of the folio.) + */ + if (folio_evictable(folio)) { + if (was_unevictable) + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); + } else { + folio_clear_active(folio); + folio_set_unevictable(folio); + /* + * folio->mlock_count = !!folio_test_mlocked(folio)? + * But that leaves __mlock_page() in doubt whether another + * actor has already counted the mlock or not. Err on the + * safe side, underestimate, let page reclaim fix it, rather + * than leaving a page on the unevictable LRU indefinitely. 
+ */ + folio->mlock_count = 0; + if (!was_unevictable) + __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); + } + + lruvec_add_folio(lruvec, folio); + trace_mm_lru_insertion(folio); +} + +static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) +{ + int i; + struct lruvec *lruvec = NULL; + unsigned long flags = 0; + + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; + + /* block memcg migration while the folio moves between lru */ + if (move_fn != lru_add_fn && !folio_test_clear_lru(folio)) + continue; + + lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); + move_fn(lruvec, folio); + + folio_set_lru(folio); + } + + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + folios_put(fbatch->folios, folio_batch_count(fbatch)); + folio_batch_init(fbatch); +} + +static void folio_batch_add_and_move(struct folio_batch *fbatch, + struct folio *folio, move_fn_t move_fn) +{ + if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) && + !lru_cache_disabled()) + return; + folio_batch_move_lru(fbatch, move_fn); +} + +static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio) +{ + if (!folio_test_unevictable(folio)) { + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + lruvec_add_folio_tail(lruvec, folio); + __count_vm_events(PGROTATED, folio_nr_pages(folio)); + } +} + +/* + * Writeback is about to end against a folio which has been marked for + * immediate reclaim. If it still appears to be reclaimable, move it + * to the tail of the inactive list. + * + * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races. 
+ */ +void folio_rotate_reclaimable(struct folio *folio) +{ + if (!folio_test_locked(folio) && !folio_test_dirty(folio) && + !folio_test_unevictable(folio) && folio_test_lru(folio)) { + struct folio_batch *fbatch; + unsigned long flags; + + folio_get(folio); + local_lock_irqsave(&lru_rotate.lock, flags); + fbatch = this_cpu_ptr(&lru_rotate.fbatch); + folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn); + local_unlock_irqrestore(&lru_rotate.lock, flags); + } +} + +void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) +{ + do { + unsigned long lrusize; + + /* + * Hold lruvec->lru_lock is safe here, since + * 1) The pinned lruvec in reclaim, or + * 2) From a pre-LRU page during refault (which also holds the + * rcu lock, so would be safe even if the page was on the LRU + * and could move simultaneously to a new lruvec). + */ + spin_lock_irq(&lruvec->lru_lock); + /* Record cost event */ + if (file) + lruvec->file_cost += nr_pages; + else + lruvec->anon_cost += nr_pages; + + /* + * Decay previous events + * + * Because workloads change over time (and to avoid + * overflow) we keep these statistics as a floating + * average, which ends up weighing recent refaults + * more than old ones. 
+ */ + lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + + lruvec_page_state(lruvec, NR_ACTIVE_FILE); + + if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { + lruvec->file_cost /= 2; + lruvec->anon_cost /= 2; + } + spin_unlock_irq(&lruvec->lru_lock); + } while ((lruvec = parent_lruvec(lruvec))); +} + +void lru_note_cost_folio(struct folio *folio) +{ + lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), + folio_nr_pages(folio)); +} + +static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) +{ + if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { + long nr_pages = folio_nr_pages(folio); + + lruvec_del_folio(lruvec, folio); + folio_set_active(folio); + lruvec_add_folio(lruvec, folio); + trace_mm_lru_activate(folio); + + __count_vm_events(PGACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, + nr_pages); + } +} + +#ifdef CONFIG_SMP +static void folio_activate_drain(int cpu) +{ + struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu); + + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, folio_activate_fn); +} + +void folio_activate(struct folio *folio) +{ + if (folio_test_lru(folio) && !folio_test_active(folio) && + !folio_test_unevictable(folio)) { + struct folio_batch *fbatch; + + folio_get(folio); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.activate); + folio_batch_add_and_move(fbatch, folio, folio_activate_fn); + local_unlock(&cpu_fbatches.lock); + } +} + +#else +static inline void folio_activate_drain(int cpu) +{ +} + +void folio_activate(struct folio *folio) +{ + struct lruvec *lruvec; + + if (folio_test_clear_lru(folio)) { + lruvec = folio_lruvec_lock_irq(folio); + folio_activate_fn(lruvec, folio); + unlock_page_lruvec_irq(lruvec); + folio_set_lru(folio); + } +} +#endif + +static void __lru_cache_activate_folio(struct folio *folio) +{ + 
struct folio_batch *fbatch; + int i; + + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); + + /* + * Search backwards on the optimistic assumption that the folio being + * activated has just been added to this batch. Note that only + * the local batch is examined as a !LRU folio could be in the + * process of being released, reclaimed, migrated or on a remote + * batch that is currently being drained. Furthermore, marking + * a remote batch's folio active potentially hits a race where + * a folio is marked active just after it is added to the inactive + * list causing accounting errors and BUG_ON checks to trigger. + */ + for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) { + struct folio *batch_folio = fbatch->folios[i]; + + if (batch_folio == folio) { + folio_set_active(folio); + break; + } + } + + local_unlock(&cpu_fbatches.lock); +} + +#ifdef CONFIG_LRU_GEN +static void folio_inc_refs(struct folio *folio) +{ + unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + + if (folio_test_unevictable(folio)) + return; + + if (!folio_test_referenced(folio)) { + folio_set_referenced(folio); + return; + } + + if (!folio_test_workingset(folio)) { + folio_set_workingset(folio); + return; + } + + /* see the comment on MAX_NR_TIERS */ + do { + new_flags = old_flags & LRU_REFS_MASK; + if (new_flags == LRU_REFS_MASK) + break; + + new_flags += BIT(LRU_REFS_PGOFF); + new_flags |= old_flags & ~LRU_REFS_MASK; + } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); +} +#else +static void folio_inc_refs(struct folio *folio) +{ +} +#endif /* CONFIG_LRU_GEN */ + +/* + * Mark a page as having seen activity. + * + * inactive,unreferenced -> inactive,referenced + * inactive,referenced -> active,unreferenced + * active,unreferenced -> active,referenced + * + * When a newly allocated page is not yet visible, so safe for non-atomic ops, + * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). 
+ */ +void folio_mark_accessed(struct folio *folio) +{ + if (lru_gen_enabled()) { + folio_inc_refs(folio); + return; + } + + if (!folio_test_referenced(folio)) { + folio_set_referenced(folio); + } else if (folio_test_unevictable(folio)) { + /* + * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, + * this list is never rotated or maintained, so marking an + * unevictable page accessed has no effect. + */ + } else if (!folio_test_active(folio)) { + /* + * If the folio is on the LRU, queue it for activation via + * cpu_fbatches.activate. Otherwise, assume the folio is in a + * folio_batch, mark it active and it'll be moved to the active + * LRU on the next drain. + */ + if (folio_test_lru(folio)) + folio_activate(folio); + else + __lru_cache_activate_folio(folio); + folio_clear_referenced(folio); + workingset_activation(folio); + } + if (folio_test_idle(folio)) + folio_clear_idle(folio); +} +EXPORT_SYMBOL(folio_mark_accessed); + +/** + * folio_add_lru - Add a folio to an LRU list. + * @folio: The folio to be added to the LRU. + * + * Queue the folio for addition to the LRU. The decision on whether + * to add the page to the [in]active [file|anon] list is deferred until the + * folio_batch is drained. This gives a chance for the caller of folio_add_lru() + * have the folio added to the active list using folio_mark_accessed(). 
+ */ +void folio_add_lru(struct folio *folio) +{ + struct folio_batch *fbatch; + + VM_BUG_ON_FOLIO(folio_test_active(folio) && + folio_test_unevictable(folio), folio); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + + /* see the comment in lru_gen_add_folio() */ + if (lru_gen_enabled() && !folio_test_unevictable(folio) && + lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) + folio_set_active(folio); + + folio_get(folio); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); + folio_batch_add_and_move(fbatch, folio, lru_add_fn); + local_unlock(&cpu_fbatches.lock); +} +EXPORT_SYMBOL(folio_add_lru); + +/** + * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA. + * @folio: The folio to be added to the LRU. + * @vma: VMA in which the folio is mapped. + * + * If the VMA is mlocked, @folio is added to the unevictable list. + * Otherwise, it is treated the same way as folio_add_lru(). + */ +void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) +{ + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + + if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) + mlock_new_page(&folio->page); + else + folio_add_lru(folio); +} + +/* + * If the folio cannot be invalidated, it is moved to the + * inactive list to speed up its reclaim. It is moved to the + * head of the list, rather than the tail, to give the flusher + * threads some time to write it out, as this is much more + * effective than the single-page writeout from reclaim. + * + * If the folio isn't mapped and dirty/writeback, the folio + * could be reclaimed asap using the reclaim flag. + * + * 1. active, mapped folio -> none + * 2. active, dirty/writeback folio -> inactive, head, reclaim + * 3. inactive, mapped folio -> none + * 4. inactive, dirty/writeback folio -> inactive, head, reclaim + * 5. inactive, clean -> inactive, tail + * 6. 
Others -> none + * + * In 4, it moves to the head of the inactive list so the folio is + * written out by flusher threads as this is much more efficient + * than the single-page writeout from reclaim. + */ +static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio) +{ + bool active = folio_test_active(folio); + long nr_pages = folio_nr_pages(folio); + + if (folio_test_unevictable(folio)) + return; + + /* Some processes are using the folio */ + if (folio_mapped(folio)) + return; + + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_clear_referenced(folio); + + if (folio_test_writeback(folio) || folio_test_dirty(folio)) { + /* + * Setting the reclaim flag could race with + * folio_end_writeback() and confuse readahead. But the + * race window is _really_ small and it's not a critical + * problem. + */ + lruvec_add_folio(lruvec, folio); + folio_set_reclaim(folio); + } else { + /* + * The folio's writeback ended while it was in the batch. + * We move that folio to the tail of the inactive list. 
+ */ + lruvec_add_folio_tail(lruvec, folio); + __count_vm_events(PGROTATED, nr_pages); + } + + if (active) { + __count_vm_events(PGDEACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_pages); + } +} + +static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio) +{ + if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { + long nr_pages = folio_nr_pages(folio); + + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_clear_referenced(folio); + lruvec_add_folio(lruvec, folio); + + __count_vm_events(PGDEACTIVATE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_pages); + } +} + +static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio) +{ + if (folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { + long nr_pages = folio_nr_pages(folio); + + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_clear_referenced(folio); + /* + * Lazyfree folios are clean anonymous folios. They have + * the swapbacked flag cleared, to distinguish them from normal + * anonymous folios + */ + folio_clear_swapbacked(folio); + lruvec_add_folio(lruvec, folio); + + __count_vm_events(PGLAZYFREE, nr_pages); + __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, + nr_pages); + } +} + +/* + * Drain pages out of the cpu's folio_batch. + * Either "cpu" is the current CPU, and preemption has already been + * disabled; or "cpu" is being hot-unplugged, and is already dead. + */ +void lru_add_drain_cpu(int cpu) +{ + struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); + struct folio_batch *fbatch = &fbatches->lru_add; + + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, lru_add_fn); + + fbatch = &per_cpu(lru_rotate.fbatch, cpu); + /* Disabling interrupts below acts as a compiler barrier. 
*/ + if (data_race(folio_batch_count(fbatch))) { + unsigned long flags; + + /* No harm done if a racing interrupt already did this */ + local_lock_irqsave(&lru_rotate.lock, flags); + folio_batch_move_lru(fbatch, lru_move_tail_fn); + local_unlock_irqrestore(&lru_rotate.lock, flags); + } + + fbatch = &fbatches->lru_deactivate_file; + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, lru_deactivate_file_fn); + + fbatch = &fbatches->lru_deactivate; + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, lru_deactivate_fn); + + fbatch = &fbatches->lru_lazyfree; + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, lru_lazyfree_fn); + + folio_activate_drain(cpu); +} + +/** + * deactivate_file_folio() - Deactivate a file folio. + * @folio: Folio to deactivate. + * + * This function hints to the VM that @folio is a good reclaim candidate, + * for example if its invalidation fails due to the folio being dirty + * or under writeback. + * + * Context: Caller holds a reference on the folio. + */ +void deactivate_file_folio(struct folio *folio) +{ + struct folio_batch *fbatch; + + /* Deactivating an unevictable folio will not accelerate reclaim */ + if (folio_test_unevictable(folio)) + return; + + folio_get(folio); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file); + folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn); + local_unlock(&cpu_fbatches.lock); +} + +/* + * deactivate_page - deactivate a page + * @page: page to deactivate + * + * deactivate_page() moves @page to the inactive list if @page was on the active + * list and was not an unevictable page. This is done to accelerate the reclaim + * of @page. 
+ */ +void deactivate_page(struct page *page) +{ + struct folio *folio = page_folio(page); + + if (folio_test_lru(folio) && !folio_test_unevictable(folio) && + (folio_test_active(folio) || lru_gen_enabled())) { + struct folio_batch *fbatch; + + folio_get(folio); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate); + folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn); + local_unlock(&cpu_fbatches.lock); + } +} + +/** + * mark_page_lazyfree - make an anon page lazyfree + * @page: page to deactivate + * + * mark_page_lazyfree() moves @page to the inactive file list. + * This is done to accelerate the reclaim of @page. + */ +void mark_page_lazyfree(struct page *page) +{ + struct folio *folio = page_folio(page); + + if (folio_test_lru(folio) && folio_test_anon(folio) && + folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && + !folio_test_unevictable(folio)) { + struct folio_batch *fbatch; + + folio_get(folio); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree); + folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn); + local_unlock(&cpu_fbatches.lock); + } +} + +void lru_add_drain(void) +{ + local_lock(&cpu_fbatches.lock); + lru_add_drain_cpu(smp_processor_id()); + local_unlock(&cpu_fbatches.lock); + mlock_page_drain_local(); +} + +/* + * It's called from per-cpu workqueue context in SMP case so + * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on + * the same cpu. It shouldn't be a problem in !SMP case since + * the core is only one and the locks will disable preemption. 
+ */ +static void lru_add_and_bh_lrus_drain(void) +{ + local_lock(&cpu_fbatches.lock); + lru_add_drain_cpu(smp_processor_id()); + local_unlock(&cpu_fbatches.lock); + invalidate_bh_lrus_cpu(); + mlock_page_drain_local(); +} + +void lru_add_drain_cpu_zone(struct zone *zone) +{ + local_lock(&cpu_fbatches.lock); + lru_add_drain_cpu(smp_processor_id()); + drain_local_pages(zone); + local_unlock(&cpu_fbatches.lock); + mlock_page_drain_local(); +} + +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); + +static void lru_add_drain_per_cpu(struct work_struct *dummy) +{ + lru_add_and_bh_lrus_drain(); +} + +static bool cpu_needs_drain(unsigned int cpu) +{ + struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); + + /* Check these in order of likelihood that they're not zero */ + return folio_batch_count(&fbatches->lru_add) || + data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || + folio_batch_count(&fbatches->lru_deactivate_file) || + folio_batch_count(&fbatches->lru_deactivate) || + folio_batch_count(&fbatches->lru_lazyfree) || + folio_batch_count(&fbatches->activate) || + need_mlock_page_drain(cpu) || + has_bh_in_lru(cpu, NULL); +} + +/* + * Doesn't need any cpu hotplug locking because we do rely on per-cpu + * kworkers being shut down before our page_alloc_cpu_dead callback is + * executed on the offlined cpu. + * Calling this function with cpu hotplug locks held can actually lead + * to obscure indirect dependencies via WQ context. + */ +static inline void __lru_add_drain_all(bool force_all_cpus) +{ + /* + * lru_drain_gen - Global pages generation number + * + * (A) Definition: global lru_drain_gen = x implies that all generations + * 0 < n <= x are already *scheduled* for draining. + * + * This is an optimization for the highly-contended use case where a + * user space workload keeps constantly generating a flow of pages for + * each CPU. 
+ */ + static unsigned int lru_drain_gen; + static struct cpumask has_work; /* CPUs that had drain work queued by the current caller */ + static DEFINE_MUTEX(lock); /* held from the generation check until all queued work is flushed */ + unsigned cpu, this_gen; + + /* + * Make sure nobody triggers this path before mm_percpu_wq is fully + * initialized. + */ + if (WARN_ON(!mm_percpu_wq)) + return; + + /* + * Guarantee folio_batch counter stores visible by this CPU + * are visible to other CPUs before loading the current drain + * generation. + */ + smp_mb(); + + /* + * (B) Locally cache global LRU draining generation number + * + * The read barrier ensures that the counter is loaded before the mutex + * is taken. It pairs with smp_mb() inside the mutex critical section + * at (D). + */ + this_gen = smp_load_acquire(&lru_drain_gen); + + mutex_lock(&lock); + + /* + * (C) Exit the draining operation if a newer generation, from another + * lru_add_drain_all(), was already scheduled for draining. Check (A). This early exit is never taken when force_all_cpus is set. + */ + if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) + goto done; + + /* + * (D) Increment global generation number + * + * Pairs with smp_load_acquire() at (B), outside of the critical + * section. Use a full memory barrier to guarantee that the + * new global drain generation number is stored before loading + * folio_batch counters. + * + * This pairing must be done here, before the for_each_online_cpu loop + * below which drains the page vectors. + * + * Let x, y, and z represent some system CPU numbers, where x < y < z. + * Assume CPU #z is in the middle of the for_each_online_cpu loop + * below and has already reached CPU #y's per-cpu data. CPU #x comes + * along, adds some pages to its per-cpu vectors, then calls + * lru_add_drain_all(). + * + * If the paired barrier is done at any later step, e.g. after the + * loop, CPU #x will just exit at (C) and miss flushing out all of its + * added pages.
+ */ + WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); + smp_mb(); + + cpumask_clear(&has_work); + for_each_online_cpu(cpu) { + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); + + if (cpu_needs_drain(cpu)) { + INIT_WORK(work, lru_add_drain_per_cpu); + queue_work_on(cpu, mm_percpu_wq, work); + __cpumask_set_cpu(cpu, &has_work); + } + } + /* Wait for every work item queued by the loop above. */ + for_each_cpu(cpu, &has_work) + flush_work(&per_cpu(lru_add_drain_work, cpu)); + +done: + mutex_unlock(&lock); +} +/* SMP: queue drain work on the CPUs that need it; the work may be skipped if a newer drain generation was already scheduled. */ +void lru_add_drain_all(void) +{ + __lru_add_drain_all(false); +} +#else /* !CONFIG_SMP */ +void lru_add_drain_all(void) +{ + lru_add_drain(); +} +#endif /* CONFIG_SMP */ + +atomic_t lru_disable_count = ATOMIC_INIT(0); + +/* + * lru_cache_disable() needs to be called before we start compiling + * a list of pages to be migrated using isolate_lru_page(). + * It disables the LRU cache on all cpus and drains the pages already on it; + * the cache stays disabled until lru_cache_enable is called. + * + * Must be paired with a call to lru_cache_enable(). + */ +void lru_cache_disable(void) +{ + atomic_inc(&lru_disable_count); + /* + * Readers of lru_disable_count are protected by either disabling + * preemption or rcu_read_lock: + * + * preempt_disable, local_irq_disable [bh_lru_lock()] + * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT] + * preempt_disable [local_lock !CONFIG_PREEMPT_RT] + * + * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on + * preempt_disable() regions of code. So any CPU which sees + * lru_disable_count = 0 will have exited the critical + * section when synchronize_rcu() returns. + */ + synchronize_rcu_expedited(); +#ifdef CONFIG_SMP + __lru_add_drain_all(true); +#else + lru_add_and_bh_lrus_drain(); +#endif +} + +/** + * release_pages - batched put_page() + * @pages: array of pages to release + * @nr: number of pages + * + * Decrement the reference count on all the pages in @pages. If it + * fell to zero, remove the page from the LRU and free it.
+ */ +void release_pages(struct page **pages, int nr) +{ + int i; + LIST_HEAD(pages_to_free); + struct lruvec *lruvec = NULL; + unsigned long flags = 0; + unsigned int lock_batch; /* read only while lruvec != NULL; reset to 0 whenever the held lruvec changes */ + + for (i = 0; i < nr; i++) { + struct folio *folio = page_folio(pages[i]); + + /* + * Make sure the IRQ-safe lock-holding time does not get + * excessive with a continuous string of pages from the + * same lruvec. The lock is held only if lruvec != NULL. + */ + if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; + } + /* The huge zero page is skipped without dropping a reference. */ + if (is_huge_zero_page(&folio->page)) + continue; + + if (folio_is_zone_device(folio)) { + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; + } + if (put_devmap_managed_page(&folio->page)) + continue; + if (folio_put_testzero(folio)) + free_zone_device_page(&folio->page); + continue; + } + /* Drop the reference; carry on only if it was the last one. */ + if (!folio_put_testzero(folio)) + continue; + /* Large folios go through __folio_put_large() with the lruvec lock released. */ + if (folio_test_large(folio)) { + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; + } + __folio_put_large(folio); + continue; + } + /* Still on an LRU list: take (or keep) its lruvec lock and unlink it. */ + if (folio_test_lru(folio)) { + struct lruvec *prev_lruvec = lruvec; + + lruvec = folio_lruvec_relock_irqsave(folio, lruvec, + &flags); + if (prev_lruvec != lruvec) + lock_batch = 0; + + lruvec_del_folio(lruvec, folio); + __folio_clear_lru_flags(folio); + } + + /* + * In rare cases, when truncation or holepunching raced with + * munlock after VM_LOCKED was cleared, Mlocked may still be + * found set here. This does not indicate a problem, unless + * "unevictable_pgs_cleared" appears worryingly large.
+ */ + if (unlikely(folio_test_mlocked(folio))) { + __folio_clear_mlocked(folio); + zone_stat_sub_folio(folio, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGCLEARED); + } + /* Queue the folio; the whole list is uncharged and freed after the loop. */ + list_add(&folio->lru, &pages_to_free); + } + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + + mem_cgroup_uncharge_list(&pages_to_free); + free_unref_page_list(&pages_to_free); +} +EXPORT_SYMBOL(release_pages); + +/* + * The pages which we're about to release may be in the deferred lru-addition + * queues. That would prevent them from really being freed right now. That's + * OK from a correctness point of view but is inefficient - those pages may be + * cache-warm and we want to give them back to the page allocator ASAP. + * + * So __pagevec_release() will drain those queues here. + * folio_batch_move_lru() calls folios_put() directly to avoid + * mutual recursion. + */ +void __pagevec_release(struct pagevec *pvec) +{ + if (!pvec->percpu_pvec_drained) { + lru_add_drain(); + pvec->percpu_pvec_drained = true; /* later calls skip the drain while this flag stays set */ + } + release_pages(pvec->pages, pagevec_count(pvec)); + pagevec_reinit(pvec); +} +EXPORT_SYMBOL(__pagevec_release); + +/** + * folio_batch_remove_exceptionals() - Prune non-folios from a batch. + * @fbatch: The batch to prune + * + * find_get_entries() fills a batch with both folios and shadow/swap/DAX + * entries. This function prunes all the non-folio entries from @fbatch + * without leaving holes, so that it can be passed on to folio-only batch + * operations.
+ */ +void folio_batch_remove_exceptionals(struct folio_batch *fbatch) +{ + unsigned int i, j; + + for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; + if (!xa_is_value(folio)) + fbatch->folios[j++] = folio; + } + fbatch->nr = j; +} +/* Fill @pvec with up to PAGEVEC_SIZE pages returned by find_get_pages_range_tag() and return how many were stored. */ +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + xa_mark_t tag) +{ + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, + PAGEVEC_SIZE, pvec->pages); + return pagevec_count(pvec); +} +EXPORT_SYMBOL(pagevec_lookup_range_tag); + +/* + * Perform any setup for the swap system + */ +void __init swap_setup(void) +{ + unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); + + /* Use a smaller cluster for small-memory machines */ + if (megs < 16) + page_cluster = 2; + else + page_cluster = 3; + /* + * Right now other parts of the system mean that we + * _really_ don't want to cluster much more + */ +} diff --git a/mm/swap.h b/mm/swap.h new file mode 100644 index 000000000..cc08c459c --- /dev/null +++ b/mm/swap.h @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_SWAP_H +#define _MM_SWAP_H + +#ifdef CONFIG_SWAP +#include <linux/blk_types.h> /* for bio_end_io_t */ + +/* linux/mm/page_io.c */ +int sio_pool_init(void); +struct swap_iocb; +int swap_readpage(struct page *page, bool do_poll, + struct swap_iocb **plug); +void __swap_read_unplug(struct swap_iocb *plug); +static inline void swap_read_unplug(struct swap_iocb *plug) +{ + if (unlikely(plug)) + __swap_read_unplug(plug); +} +void swap_write_unplug(struct swap_iocb *sio); +int swap_writepage(struct page *page, struct writeback_control *wbc); +int __swap_writepage(struct page *page, struct writeback_control *wbc); + +/* linux/mm/swap_state.c */ +/* One swap address space for each 64M swap space */ +#define SWAP_ADDRESS_SPACE_SHIFT 14 +#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) +extern struct address_space *swapper_spaces[]; +#define
swap_address_space(entry) \ + (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ + >> SWAP_ADDRESS_SPACE_SHIFT]) + +void show_swap_cache_info(void); +bool add_to_swap(struct folio *folio); +void *get_shadow_from_swap_cache(swp_entry_t entry); +int add_to_swap_cache(struct folio *folio, swp_entry_t entry, + gfp_t gfp, void **shadowp); +void __delete_from_swap_cache(struct folio *folio, + swp_entry_t entry, void *shadow); +void delete_from_swap_cache(struct folio *folio); +void clear_shadow_from_swap_cache(int type, unsigned long begin, + unsigned long end); +struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr); +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); + +struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, + unsigned long addr, + bool do_poll, + struct swap_iocb **plug); +struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, + unsigned long addr, + bool *new_page_allocated); +struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, + struct vm_fault *vmf); +struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, + struct vm_fault *vmf); +/* Return the flags field of what page_swap_info() yields for &folio->page. */ +static inline unsigned int folio_swap_flags(struct folio *folio) +{ + return page_swap_info(&folio->page)->flags; +} +#else /* !CONFIG_SWAP */ +struct swap_iocb; +static inline int swap_readpage(struct page *page, bool do_poll, + struct swap_iocb **plug) +{ + return 0; +} +static inline void swap_write_unplug(struct swap_iocb *sio) +{ +} +/* The stubs below are the no-op / constant-return versions used when CONFIG_SWAP is off. */ +static inline struct address_space *swap_address_space(swp_entry_t entry) +{ + return NULL; +} + +static inline void show_swap_cache_info(void) +{ +} + +static inline struct page *swap_cluster_readahead(swp_entry_t entry, + gfp_t gfp_mask, struct vm_fault *vmf) +{ + return NULL; +} + +static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, + struct vm_fault *vmf) +{ +
return NULL; +} + +static inline int swap_writepage(struct page *p, struct writeback_control *wbc) +{ + return 0; +} + +static inline struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) +{ + return NULL; +} + +static inline +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +{ + return find_get_page(mapping, index); +} + +static inline bool add_to_swap(struct folio *folio) +{ + return false; +} + +static inline void *get_shadow_from_swap_cache(swp_entry_t entry) +{ + return NULL; +} + +static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry, + gfp_t gfp_mask, void **shadowp) +{ + return -1; +} + +static inline void __delete_from_swap_cache(struct folio *folio, + swp_entry_t entry, void *shadow) +{ +} + +static inline void delete_from_swap_cache(struct folio *folio) +{ +} + +static inline void clear_shadow_from_swap_cache(int type, unsigned long begin, + unsigned long end) +{ +} + +static inline unsigned int folio_swap_flags(struct folio *folio) +{ + return 0; +} +#endif /* CONFIG_SWAP */ +#endif /* _MM_SWAP_H */ diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c new file mode 100644 index 000000000..db6c4a26c --- /dev/null +++ b/mm/swap_cgroup.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/swap_cgroup.h> +#include <linux/vmalloc.h> +#include <linux/mm.h> + +#include <linux/swapops.h> /* depends on mm.h include */ + +static DEFINE_MUTEX(swap_cgroup_mutex); +struct swap_cgroup_ctrl { + struct page **map; /* map[n] backs entries n * SC_PER_PAGE .. (n + 1) * SC_PER_PAGE - 1 */ + unsigned long length; /* number of slots in map */ + spinlock_t lock; /* taken around id updates in swap_cgroup_cmpxchg() and swap_cgroup_record() */ +}; + +static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; + +struct swap_cgroup { + unsigned short id; /* mem_cgroup id recorded for one swap entry */ +}; +#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) + +/* + * SwapCgroup implements "lookup" and "exchange" operations. + * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge + * against SwapCache. At swap_free(), this is accessed directly from swap.
+ * + * This means, + * - we have no race in "exchange" when we're accessed via SwapCache because + * SwapCache(and its swp_entry) is under lock. + * - When called via swap_free(), there is no user of this entry and no race. + * Then, we don't need lock around "exchange". + * + * TODO: we can push these buffers out to HIGHMEM. + */ + +/* + * allocate buffer for swap_cgroup: one zeroed page per ctrl->map slot. On failure the pages allocated so far are freed and -ENOMEM is returned; the stale pointers are left in ctrl->map. + */ +static int swap_cgroup_prepare(int type) +{ + struct page *page; + struct swap_cgroup_ctrl *ctrl; + unsigned long idx, max; + + ctrl = &swap_cgroup_ctrl[type]; + + for (idx = 0; idx < ctrl->length; idx++) { + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto not_enough_page; + ctrl->map[idx] = page; + + if (!(idx % SWAP_CLUSTER_MAX)) + cond_resched(); + } + return 0; +not_enough_page: + max = idx; + for (idx = 0; idx < max; idx++) + __free_page(ctrl->map[idx]); + + return -ENOMEM; +} +/* Return the swap_cgroup slot for @offset: page ctrl->map[offset / SC_PER_PAGE], element offset % SC_PER_PAGE. */ +static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl, + pgoff_t offset) +{ + struct page *mappage; + struct swap_cgroup *sc; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + sc = page_address(mappage); + return sc + offset % SC_PER_PAGE; +} +/* Look up the slot for @ent; if @ctrlp is non-NULL it also receives the controller of @ent's swap type. */ +static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, + struct swap_cgroup_ctrl **ctrlp) +{ + pgoff_t offset = swp_offset(ent); + struct swap_cgroup_ctrl *ctrl; + + ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + if (ctrlp) + *ctrlp = ctrl; + return __lookup_swap_cgroup(ctrl, offset); +} + +/** + * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. + * @ent: swap entry to be cmpxchged + * @old: old id + * @new: new id + * + * Returns old id at success, 0 at failure.
+ * (There is no mem_cgroup using 0 as its id) + */ +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, + unsigned short old, unsigned short new) +{ + struct swap_cgroup_ctrl *ctrl; + struct swap_cgroup *sc; + unsigned long flags; + unsigned short retval; + + sc = lookup_swap_cgroup(ent, &ctrl); + + spin_lock_irqsave(&ctrl->lock, flags); + retval = sc->id; + if (retval == old) + sc->id = new; + else + retval = 0; + spin_unlock_irqrestore(&ctrl->lock, flags); + return retval; +} + +/** + * swap_cgroup_record - record mem_cgroup for a set of swap entries + * @ent: the first swap entry to be recorded into + * @id: mem_cgroup to be recorded + * @nr_ents: number of swap entries to be recorded; must be at least 1, since the loop stores an id before it tests for the end of the range + * + * Returns the id previously recorded for @ent; the function has no failure path. + * (Of course, old value can be 0.) All entries in the range are expected to hold that same old id, see the VM_BUG_ON() below. + */ +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, + unsigned int nr_ents) +{ + struct swap_cgroup_ctrl *ctrl; + struct swap_cgroup *sc; + unsigned short old; + unsigned long flags; + pgoff_t offset = swp_offset(ent); + pgoff_t end = offset + nr_ents; + + sc = lookup_swap_cgroup(ent, &ctrl); + + spin_lock_irqsave(&ctrl->lock, flags); + old = sc->id; + for (;;) { + VM_BUG_ON(sc->id != old); + sc->id = id; + offset++; + if (offset == end) + break; + if (offset % SC_PER_PAGE) + sc++; + else + sc = __lookup_swap_cgroup(ctrl, offset); /* crossed into the next map page */ + } + spin_unlock_irqrestore(&ctrl->lock, flags); + + return old; +} + +/** + * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry + * @ent: swap entry to be looked up. + * + * Returns ID of mem_cgroup at success. 0 at failure.
(0 is invalid ID) + */ +unsigned short lookup_swap_cgroup_id(swp_entry_t ent) +{ + return lookup_swap_cgroup(ent, NULL)->id; +} +/* Allocate and install the id map for swap type @type, sized for @max_pages entries. Returns 0 on success or when mem_cgroup is disabled, -ENOMEM on allocation failure. */ +int swap_cgroup_swapon(int type, unsigned long max_pages) +{ + void *array; + unsigned long length; + struct swap_cgroup_ctrl *ctrl; + + if (mem_cgroup_disabled()) + return 0; + + length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); + + array = vcalloc(length, sizeof(void *)); + if (!array) + goto nomem; + + ctrl = &swap_cgroup_ctrl[type]; + mutex_lock(&swap_cgroup_mutex); + ctrl->length = length; + ctrl->map = array; + spin_lock_init(&ctrl->lock); + if (swap_cgroup_prepare(type)) { + /* memory shortage */ + ctrl->map = NULL; + ctrl->length = 0; + mutex_unlock(&swap_cgroup_mutex); + vfree(array); + goto nomem; + } + mutex_unlock(&swap_cgroup_mutex); + + return 0; +nomem: + pr_info("couldn't allocate enough memory for swap_cgroup\n"); + pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n"); + return -ENOMEM; +} +/* Detach the id map of swap type @type under swap_cgroup_mutex, then free its pages and the array after the mutex is dropped. No-op when mem_cgroup is disabled. */ +void swap_cgroup_swapoff(int type) +{ + struct page **map; + unsigned long i, length; + struct swap_cgroup_ctrl *ctrl; + + if (mem_cgroup_disabled()) + return; + + mutex_lock(&swap_cgroup_mutex); + ctrl = &swap_cgroup_ctrl[type]; + map = ctrl->map; + length = ctrl->length; + ctrl->map = NULL; + ctrl->length = 0; + mutex_unlock(&swap_cgroup_mutex); + + if (map) { + for (i = 0; i < length; i++) { + struct page *page = map[i]; + if (page) + __free_page(page); + if (!(i % SWAP_CLUSTER_MAX)) + cond_resched(); + } + vfree(map); + } +} diff --git a/mm/swap_slots.c b/mm/swap_slots.c new file mode 100644 index 000000000..0bec1f705 --- /dev/null +++ b/mm/swap_slots.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Manage cache of swap slots to be used for and returned from + * swap. + * + * Copyright(c) 2016 Intel Corporation. + * + * Author: Tim Chen + * + * We allocate the swap slots from the global pool and put + * them into local per cpu caches.
This has the advantage + * of not needing to acquire the swap_info lock every time + * we need a new slot. + * + * There is also opportunity to simply return the slot + * to local caches without needing to acquire swap_info + * lock. We do not reuse the returned slots directly but + * move them back to the global pool in a batch. This + * allows the slots to coalesce and reduce fragmentation. + * + * The swap entry allocated is marked with SWAP_HAS_CACHE + * flag in map_count that prevents it from being allocated + * again from the global pool. + * + * The swap slots cache is protected by a mutex instead of + * a spin lock as when we search for slots with scan_swap_map, + * we can possibly sleep. + */ + +#include <linux/swap_slots.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/mutex.h> +#include <linux/mm.h> + +static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); +static bool swap_slot_cache_active; +bool swap_slot_cache_enabled; +static bool swap_slot_cache_initialized; +static DEFINE_MUTEX(swap_slots_cache_mutex); +/* Serialize swap slots cache enable/disable operations */ +static DEFINE_MUTEX(swap_slots_cache_enable_mutex); + +static void __drain_swap_slots_cache(unsigned int type); + +#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled) +#define SLOTS_CACHE 0x1 /* drain type bit: the allocation array, cache->slots */ +#define SLOTS_CACHE_RET 0x2 /* drain type bit: the return array, cache->slots_ret */ +/* Mark the cache inactive and drain both arrays of every online CPU, under swap_slots_cache_mutex. */ +static void deactivate_swap_slots_cache(void) +{ + mutex_lock(&swap_slots_cache_mutex); + swap_slot_cache_active = false; + __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + mutex_unlock(&swap_slots_cache_mutex); +} +/* Mark the cache active again, under swap_slots_cache_mutex. */ +static void reactivate_swap_slots_cache(void) +{ + mutex_lock(&swap_slots_cache_mutex); + swap_slot_cache_active = true; + mutex_unlock(&swap_slots_cache_mutex); +} + +/* Must not be called with cpu hot plug lock */ +void disable_swap_slots_cache_lock(void) +{ + mutex_lock(&swap_slots_cache_enable_mutex); + swap_slot_cache_enabled = false; + if (swap_slot_cache_initialized) { + /* serialize with cpu hotplug operations */ + cpus_read_lock(); +
__drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + cpus_read_unlock(); + } +} +/* Enable the cache only if has_usable_swap() returns true. */ +static void __reenable_swap_slots_cache(void) +{ + swap_slot_cache_enabled = has_usable_swap(); +} +/* Re-evaluate the enabled flag and release swap_slots_cache_enable_mutex, which disable_swap_slots_cache_lock() leaves held. */ +void reenable_swap_slots_cache_unlock(void) +{ + __reenable_swap_slots_cache(); + mutex_unlock(&swap_slots_cache_enable_mutex); +} +/* Update swap_slot_cache_active from the number of free swap pages and return the resulting state; always false while the cache is disabled. */ +static bool check_cache_active(void) +{ + long pages; + + if (!swap_slot_cache_enabled) + return false; + + pages = get_nr_swap_pages(); + if (!swap_slot_cache_active) { + if (pages > num_online_cpus() * + THRESHOLD_ACTIVATE_SWAP_SLOTS_CACHE) + reactivate_swap_slots_cache(); + goto out; + } + + /* if global pool of slot caches too low, deactivate cache */ + if (pages < num_online_cpus() * THRESHOLD_DEACTIVATE_SWAP_SLOTS_CACHE) + deactivate_swap_slots_cache(); +out: + return swap_slot_cache_active; +} +/* Allocate the slots and slots_ret arrays for @cpu's cache unless they already exist. Returns 0, or -ENOMEM if an allocation fails. */ +static int alloc_swap_slot_cache(unsigned int cpu) +{ + struct swap_slots_cache *cache; + swp_entry_t *slots, *slots_ret; + + /* + * Do allocation outside swap_slots_cache_mutex + * as kvcalloc could trigger reclaim and folio_alloc_swap, + * which can lock swap_slots_cache_mutex. + */ + slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), + GFP_KERNEL); + if (!slots) + return -ENOMEM; + + slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), + GFP_KERNEL); + if (!slots_ret) { + kvfree(slots); + return -ENOMEM; + } + + mutex_lock(&swap_slots_cache_mutex); + cache = &per_cpu(swp_slots, cpu); + if (cache->slots || cache->slots_ret) { + /* cache already allocated */ + mutex_unlock(&swap_slots_cache_mutex); + + kvfree(slots); + kvfree(slots_ret); + + return 0; + } + + if (!cache->lock_initialized) { + mutex_init(&cache->alloc_lock); + spin_lock_init(&cache->free_lock); + cache->lock_initialized = true; + } + cache->nr = 0; + cache->cur = 0; + cache->n_ret = 0; + /* + * We initialized alloc_lock and free_lock earlier.
We use + * !cache->slots or !cache->slots_ret to know if it is safe to acquire + * the corresponding lock and use the cache. Memory barrier below + * ensures the assumption. + */ + mb(); + cache->slots = slots; + cache->slots_ret = slots_ret; + mutex_unlock(&swap_slots_cache_mutex); + return 0; +} +/* Hand the entries cached for @cpu to swapcache_free_entries(); @type selects the slots and/or slots_ret array, and @free_slots additionally frees the selected arrays. */ +static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, + bool free_slots) +{ + struct swap_slots_cache *cache; + swp_entry_t *slots = NULL; + + cache = &per_cpu(swp_slots, cpu); + if ((type & SLOTS_CACHE) && cache->slots) { + mutex_lock(&cache->alloc_lock); + swapcache_free_entries(cache->slots + cache->cur, cache->nr); + cache->cur = 0; + cache->nr = 0; + if (free_slots && cache->slots) { + kvfree(cache->slots); + cache->slots = NULL; + } + mutex_unlock(&cache->alloc_lock); + } + if ((type & SLOTS_CACHE_RET) && cache->slots_ret) { + spin_lock_irq(&cache->free_lock); + swapcache_free_entries(cache->slots_ret, cache->n_ret); + cache->n_ret = 0; + if (free_slots && cache->slots_ret) { + slots = cache->slots_ret; + cache->slots_ret = NULL; + } + spin_unlock_irq(&cache->free_lock); + kvfree(slots); /* done after dropping the spinlock; slots is NULL unless the array was detached above */ + } +} +/* Drain the caches selected by @type on every online CPU without freeing the arrays. */ +static void __drain_swap_slots_cache(unsigned int type) +{ + unsigned int cpu; + + /* + * This function is called during + * 1) swapoff, when we have to make sure no + * left over slots are in cache when we remove + * a swap device; + * 2) disabling of swap slot cache, when we run low + * on swap slots when allocating memory and need + * to return swap slots to global pool. + * + * We cannot acquire cpu hot plug lock here as + * this function can be invoked in the cpu + * hot plug path: + * cpu_up -> lock cpu_hotplug -> cpu hotplug state callback + * -> memory allocation -> direct reclaim -> folio_alloc_swap + * -> drain_swap_slots_cache + * + * Hence the loop over current online cpu below could miss cpu that + * is being brought online but not yet marked as online.
+ * That is okay as we do not schedule and run anything on a + * cpu before it has been marked online. Hence, we will not + * fill any swap slots in slots cache of such cpu. + * There are no slots on such cpu that need to be drained. + */ + for_each_online_cpu(cpu) + drain_slots_cache_cpu(cpu, type, false); +} +/* Drain both arrays of @cpu's cache and free them; passed to cpuhp_setup_state() in enable_swap_slots_cache(). */ +static int free_slot_cache(unsigned int cpu) +{ + mutex_lock(&swap_slots_cache_mutex); + drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true); + mutex_unlock(&swap_slots_cache_mutex); + return 0; +} +/* Register the CPU hotplug callbacks on first use, then re-evaluate the enabled flag; on registration failure the flag is left untouched. */ +void enable_swap_slots_cache(void) +{ + mutex_lock(&swap_slots_cache_enable_mutex); + if (!swap_slot_cache_initialized) { + int ret; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", + alloc_swap_slot_cache, free_slot_cache); + if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating " + "without swap slots cache.\n", __func__)) + goto out_unlock; + + swap_slot_cache_initialized = true; + } + + __reenable_swap_slots_cache(); +out_unlock: + mutex_unlock(&swap_slots_cache_enable_mutex); +} + +/* called with swap slot cache's alloc lock held */ +static int refill_swap_slots_cache(struct swap_slots_cache *cache) +{ + if (!use_swap_slot_cache) + return 0; + + cache->cur = 0; + if (swap_slot_cache_active) + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, + cache->slots, 1); + + return cache->nr; +} +/* Release @entry: park it in this CPU's slots_ret array when the cache is usable, otherwise free it directly. */ +void free_swap_slot(swp_entry_t entry) +{ + struct swap_slots_cache *cache; + + cache = raw_cpu_ptr(&swp_slots); + if (likely(use_swap_slot_cache && cache->slots_ret)) { + spin_lock_irq(&cache->free_lock); + /* Swap slots cache may be deactivated before acquiring lock */ + if (!use_swap_slot_cache || !cache->slots_ret) { + spin_unlock_irq(&cache->free_lock); + goto direct_free; + } + if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) { + /* + * Return slots to global pool. + * The current swap_map value is SWAP_HAS_CACHE.
+ * Set it to 0 to indicate it is available for + * allocation in global pool + */ + swapcache_free_entries(cache->slots_ret, cache->n_ret); + cache->n_ret = 0; + } + cache->slots_ret[cache->n_ret++] = entry; + spin_unlock_irq(&cache->free_lock); + } else { +direct_free: + swapcache_free_entries(&entry, 1); + } +} +/* Allocate a swap entry for @folio, trying this CPU's slot cache first for non-large folios. The returned entry has val == 0 if allocation or the memcg swap charge failed. */ +swp_entry_t folio_alloc_swap(struct folio *folio) +{ + swp_entry_t entry; + struct swap_slots_cache *cache; + + entry.val = 0; + + if (folio_test_large(folio)) { + if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported()) + get_swap_pages(1, &entry, folio_nr_pages(folio)); + goto out; + } + + /* + * Preemption is allowed here, because we may sleep + * in refill_swap_slots_cache(). But it is safe, because + * accesses to the per-CPU data structure are protected by the + * mutex cache->alloc_lock. + * + * The alloc path here does not touch cache->slots_ret + * so cache->free_lock is not taken. + */ + cache = raw_cpu_ptr(&swp_slots); + + if (likely(check_cache_active() && cache->slots)) { + mutex_lock(&cache->alloc_lock); + if (cache->slots) { +repeat: + if (cache->nr) { + entry = cache->slots[cache->cur]; + cache->slots[cache->cur++].val = 0; + cache->nr--; + } else if (refill_swap_slots_cache(cache)) { + goto repeat; + } + } + mutex_unlock(&cache->alloc_lock); + if (entry.val) + goto out; + } + /* The cache was unusable or empty: allocate a single entry directly. */ + get_swap_pages(1, &entry, 1); +out: + if (mem_cgroup_try_charge_swap(folio, entry)) { + put_swap_folio(folio, entry); + entry.val = 0; + } + return entry; +} diff --git a/mm/swap_state.c b/mm/swap_state.c new file mode 100644 index 000000000..438d0676c --- /dev/null +++ b/mm/swap_state.c @@ -0,0 +1,910 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/mm/swap_state.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + * + * Rewritten to use page cache, (C) 1998 Stephen Tweedie + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include
+#include +#include +#include +#include "internal.h" +#include "swap.h" + +/* + * swapper_space is a fiction, retained to simplify the path through + * vmscan's shrink_page_list. + */ +static const struct address_space_operations swap_aops = { + .writepage = swap_writepage, + .dirty_folio = noop_dirty_folio, +#ifdef CONFIG_MIGRATION + .migrate_folio = migrate_folio, +#endif +}; +/* Indexed by swap type; the swap_address_space() macro in swap.h then indexes by swp_offset >> SWAP_ADDRESS_SPACE_SHIFT. */ +struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; +static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; +static bool enable_vma_readahead __read_mostly = true; +/* Readahead value layout: page-aligned address in the bits covered by PAGE_MASK, window in the bits between SWAP_RA_WIN_SHIFT and PAGE_SHIFT, hits in the low SWAP_RA_WIN_SHIFT bits. */ +#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) +#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) +#define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK +#define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) + +#define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) +#define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) +#define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) + +#define SWAP_RA_VAL(addr, win, hits) \ + (((addr) & PAGE_MASK) | \ + (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ + ((hits) & SWAP_RA_HITS_MASK)) + +/* Initial readahead hits is 4 to start up with a small window */ +#define GET_SWAP_RA_VAL(vma) \ + (atomic_long_read(&(vma)->swap_readahead_info) ?
: 4) + +static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); +/* Print the swap cache page count and the free and total swap sizes in kB. */ +void show_swap_cache_info(void) +{ + printk("%lu pages in swap cache\n", total_swapcache_pages()); + printk("Free swap = %ldkB\n", + get_nr_swap_pages() << (PAGE_SHIFT - 10)); + printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); +} +/* Return the value entry stored at @entry's offset in its swap address space, or NULL if the slot holds anything else. */ +void *get_shadow_from_swap_cache(swp_entry_t entry) +{ + struct address_space *address_space = swap_address_space(entry); + pgoff_t idx = swp_offset(entry); + struct page *page; + + page = xa_load(&address_space->i_pages, idx); + if (xa_is_value(page)) + return page; + return NULL; +} + +/* + * add_to_swap_cache resembles filemap_add_folio on swapper_space, + * but sets SwapCache flag and private instead of mapping and index. + */ +int add_to_swap_cache(struct folio *folio, swp_entry_t entry, + gfp_t gfp, void **shadowp) +{ + struct address_space *address_space = swap_address_space(entry); + pgoff_t idx = swp_offset(entry); + XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); + unsigned long i, nr = folio_nr_pages(folio); + void *old; + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); + /* Take one reference per page and set the SwapCache flag up front; both are undone at the end if the xarray insertion fails. */ + folio_ref_add(folio, nr); + folio_set_swapcache(folio); + + do { + xas_lock_irq(&xas); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; + for (i = 0; i < nr; i++) { + VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); + old = xas_load(&xas); + if (xa_is_value(old)) { + if (shadowp) + *shadowp = old; + } + set_page_private(folio_page(folio, i), entry.val + i); + xas_store(&xas, folio); + xas_next(&xas); + } + address_space->nrpages += nr; + __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); + __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp)); + + if (!xas_error(&xas)) + return 0; + + folio_clear_swapcache(folio); + folio_ref_sub(folio, nr); + return
xas_error(&xas); +} + +/* + * This must be called only on folios that have + * been verified to be in the swap cache. Every slot of the folio is replaced by @shadow; delete_from_swap_cache() calls this with the i_pages xarray lock held. + */ +void __delete_from_swap_cache(struct folio *folio, + swp_entry_t entry, void *shadow) +{ + struct address_space *address_space = swap_address_space(entry); + int i; + long nr = folio_nr_pages(folio); + pgoff_t idx = swp_offset(entry); + XA_STATE(xas, &address_space->i_pages, idx); + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); + VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); + + for (i = 0; i < nr; i++) { + void *old = xas_store(&xas, shadow); /* previous slot content; named 'old' so it does not shadow the swp_entry_t parameter 'entry' */ + VM_BUG_ON_PAGE(old != folio, old); + set_page_private(folio_page(folio, i), 0); + xas_next(&xas); + } + folio_clear_swapcache(folio); + address_space->nrpages -= nr; + __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); +} + +/** + * add_to_swap - allocate swap space for a folio + * @folio: folio we want to move to swap + * + * Allocate swap space for the folio and add the folio to the + * swap cache. + * + * Context: Caller needs to hold the folio lock. + * Return: Whether the folio was added to the swap cache. + */ +bool add_to_swap(struct folio *folio) +{ + swp_entry_t entry; + int err; + + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio); + + entry = folio_alloc_swap(folio); + if (!entry.val) + return false; + + /* + * XArray node allocations from PF_MEMALLOC contexts could + * completely exhaust the page allocator. __GFP_NOMEMALLOC + * stops emergency reserves from being allocated. + * + * TODO: this could cause a theoretical memory reclaim + * deadlock in the swap out path. + */ + /* + * Add it to the swap cache. + */ + err = add_to_swap_cache(folio, entry, + __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); + if (err) + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag.
+ */ + goto fail; + /* + * Normally the folio will be dirtied in unmap because its + * pte should be dirty. A special case is MADV_FREE page. The + * page's pte could have dirty bit cleared but the folio's + * SwapBacked flag is still set because clearing the dirty bit + * and SwapBacked flag has no lock protected. For such folio, + * unmap will not set dirty bit for it, so folio reclaim will + * not write the folio out. This can cause data corruption when + * the folio is swapped in later. Always setting the dirty flag + * for the folio solves the problem. + */ + folio_mark_dirty(folio); + + return true; + +fail: + put_swap_folio(folio, entry); + return false; +} + +/* + * This must be called only on folios that have + * been verified to be in the swap cache and locked. + * It will never put the folio into the free list, + * the caller has a reference on the folio. + */ +void delete_from_swap_cache(struct folio *folio) +{ + swp_entry_t entry = folio_swap_entry(folio); + struct address_space *address_space = swap_address_space(entry); + + xa_lock_irq(&address_space->i_pages); + __delete_from_swap_cache(folio, entry, NULL); + xa_unlock_irq(&address_space->i_pages); + + put_swap_folio(folio, entry); + folio_ref_sub(folio, folio_nr_pages(folio)); +} + +void clear_shadow_from_swap_cache(int type, unsigned long begin, + unsigned long end) +{ + unsigned long curr = begin; + void *old; + + for (;;) { + swp_entry_t entry = swp_entry(type, curr); + struct address_space *address_space = swap_address_space(entry); + XA_STATE(xas, &address_space->i_pages, curr); + + xa_lock_irq(&address_space->i_pages); + xas_for_each(&xas, old, end) { + if (!xa_is_value(old)) + continue; + xas_store(&xas, NULL); + } + xa_unlock_irq(&address_space->i_pages); + + /* search the next swapcache until we meet end */ + curr >>= SWAP_ADDRESS_SPACE_SHIFT; + curr++; + curr <<= SWAP_ADDRESS_SPACE_SHIFT; + if (curr > end) + break; + } +} + +/* + * If we are the only user, then try to free up the swap 
cache. + * + * Its ok to check the swapcache flag without the folio lock + * here because we are going to recheck again inside + * folio_free_swap() _with_ the lock. + * - Marcelo + */ +void free_swap_cache(struct page *page) +{ + struct folio *folio = page_folio(page); + + if (folio_test_swapcache(folio) && !folio_mapped(folio) && + folio_trylock(folio)) { + folio_free_swap(folio); + folio_unlock(folio); + } +} + +/* + * Perform a free_page(), also freeing any swap cache associated with + * this page if it is the last user of the page. + */ +void free_page_and_swap_cache(struct page *page) +{ + free_swap_cache(page); + if (!is_huge_zero_page(page)) + put_page(page); +} + +/* + * Passed an array of pages, drop them all from swapcache and then release + * them. They are removed from the LRU and freed if this is their last use. + */ +void free_pages_and_swap_cache(struct page **pages, int nr) +{ + struct page **pagep = pages; + int i; + + lru_add_drain(); + for (i = 0; i < nr; i++) + free_swap_cache(pagep[i]); + release_pages(pagep, nr); +} + +static inline bool swap_use_vma_readahead(void) +{ + return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); +} + +/* + * Lookup a swap entry in the swap cache. A found folio will be returned + * unlocked and with its refcount incremented - we rely on the kernel + * lock getting page table operations atomic even if we drop the folio + * lock before returning. + */ +struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) +{ + struct folio *folio; + struct swap_info_struct *si; + + si = get_swap_device(entry); + if (!si) + return NULL; + folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); + put_swap_device(si); + + if (folio) { + bool vma_ra = swap_use_vma_readahead(); + bool readahead; + + /* + * At the moment, we don't support PG_readahead for anon THP + * so let's bail out rather than confusing the readahead stat. 
+ */ + if (unlikely(folio_test_large(folio))) + return folio; + + readahead = folio_test_clear_readahead(folio); + if (vma && vma_ra) { + unsigned long ra_val; + int win, hits; + + ra_val = GET_SWAP_RA_VAL(vma); + win = SWAP_RA_WIN(ra_val); + hits = SWAP_RA_HITS(ra_val); + if (readahead) + hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(addr, win, hits)); + } + + if (readahead) { + count_vm_event(SWAP_RA_HIT); + if (!vma || !vma_ra) + atomic_inc(&swapin_readahead_hits); + } + } + + return folio; +} + +/** + * find_get_incore_page - Find and get a page from the page or swap caches. + * @mapping: The address_space to search. + * @index: The page cache index. + * + * This differs from find_get_page() in that it will also look for the + * page in the swap cache. + * + * Return: The found page or %NULL. + */ +struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +{ + swp_entry_t swp; + struct swap_info_struct *si; + struct page *page = pagecache_get_page(mapping, index, + FGP_ENTRY | FGP_HEAD, 0); + + if (!page) + return page; + if (!xa_is_value(page)) + return find_subpage(page, index); + if (!shmem_mapping(mapping)) + return NULL; + + swp = radix_to_swp_entry(page); + /* There might be swapin error entries in shmem mapping. */ + if (non_swap_entry(swp)) + return NULL; + /* Prevent swapoff from happening to us */ + si = get_swap_device(swp); + if (!si) + return NULL; + page = find_get_page(swap_address_space(swp), swp_offset(swp)); + put_swap_device(si); + return page; +} + +struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr, + bool *new_page_allocated) +{ + struct swap_info_struct *si; + struct folio *folio; + void *shadow = NULL; + + *new_page_allocated = false; + + for (;;) { + int err; + /* + * First check the swap cache. 
Since this is normally + * called after swap_cache_get_folio() failed, re-calling + * that would confuse statistics. + */ + si = get_swap_device(entry); + if (!si) + return NULL; + folio = filemap_get_folio(swap_address_space(entry), + swp_offset(entry)); + put_swap_device(si); + if (folio) + return folio_file_page(folio, swp_offset(entry)); + + /* + * Just skip read ahead for unused swap slot. + * During swap_off when swap_slot_cache is disabled, + * we have to handle the race between putting + * swap entry in swap cache and marking swap slot + * as SWAP_HAS_CACHE. That's done in later part of code or + * else swap_off will be aborted if we return NULL. + */ + if (!__swp_swapcount(entry) && swap_slot_cache_enabled) + return NULL; + + /* + * Get a new page to read into from swap. Allocate it now, + * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will + * cause any racers to loop around until we add it to cache. + */ + folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); + if (!folio) + return NULL; + + /* + * Swap entry may have been freed since our caller observed it. + */ + err = swapcache_prepare(entry); + if (!err) + break; + + folio_put(folio); + if (err != -EEXIST) + return NULL; + + /* + * We might race against __delete_from_swap_cache(), and + * stumble across a swap_map entry whose SWAP_HAS_CACHE + * has not yet been cleared. Or race against another + * __read_swap_cache_async(), which has set SWAP_HAS_CACHE + * in swap_map, but not yet added its page to swap cache. + */ + schedule_timeout_uninterruptible(1); + } + + /* + * The swap entry is ours to swap in. Prepare the new page. + */ + + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + + if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)) + goto fail_unlock; + + /* May fail (-ENOMEM) if XArray node allocation failed. 
*/ + if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + goto fail_unlock; + + mem_cgroup_swapin_uncharge_swap(entry); + + if (shadow) + workingset_refault(folio, shadow); + + /* Caller will initiate read into locked folio */ + folio_add_lru(folio); + *new_page_allocated = true; + return &folio->page; + +fail_unlock: + put_swap_folio(folio, entry); + folio_unlock(folio); + folio_put(folio); + return NULL; +} + +/* + * Locate a page of swap in physical memory, reserving swap cache space + * and reading the disk if it is not already cached. + * A failure return means that either the page allocation failed or that + * the swap entry is no longer in use. + */ +struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, + unsigned long addr, bool do_poll, + struct swap_iocb **plug) +{ + bool page_was_allocated; + struct page *retpage = __read_swap_cache_async(entry, gfp_mask, + vma, addr, &page_was_allocated); + + if (page_was_allocated) + swap_readpage(retpage, do_poll, plug); + + return retpage; +} + +static unsigned int __swapin_nr_pages(unsigned long prev_offset, + unsigned long offset, + int hits, + int max_pages, + int prev_win) +{ + unsigned int pages, last_ra; + + /* + * This heuristic has been found to work well on both sequential and + * random loads, swapping to hard disk or to SSD: please don't ask + * what the "+ 2" means, it just happens to work well, that's all. + */ + pages = hits + 2; + if (pages == 2) { + /* + * We can have no readahead hits to judge by: but must not get + * stuck here forever, so check for an adjacent offset instead + * (and don't even bother to check whether swap type is same). 
+ */ + if (offset != prev_offset + 1 && offset != prev_offset - 1) + pages = 1; + } else { + unsigned int roundup = 4; + while (roundup < pages) + roundup <<= 1; + pages = roundup; + } + + if (pages > max_pages) + pages = max_pages; + + /* Don't shrink readahead too fast */ + last_ra = prev_win / 2; + if (pages < last_ra) + pages = last_ra; + + return pages; +} + +static unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int hits, pages, max_pages; + static atomic_t last_readahead_pages; + + max_pages = 1 << READ_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + hits = atomic_xchg(&swapin_readahead_hits, 0); + pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, + max_pages, + atomic_read(&last_readahead_pages)); + if (!hits) + WRITE_ONCE(prev_offset, offset); + atomic_set(&last_readahead_pages, pages); + + return pages; +} + +/** + * swap_cluster_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @gfp_mask: memory allocation flags + * @vmf: fault information + * + * Returns the struct page for entry and addr, after queueing swapin. + * + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + * + * This has been extended to use the NUMA policies from the mm triggering + * the readahead. + * + * Caller must hold read mmap_lock if vmf->vma is not NULL. 
+ */ +struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct vm_fault *vmf) +{ + struct page *page; + unsigned long entry_offset = swp_offset(entry); + unsigned long offset = entry_offset; + unsigned long start_offset, end_offset; + unsigned long mask; + struct swap_info_struct *si = swp_swap_info(entry); + struct blk_plug plug; + struct swap_iocb *splug = NULL; + bool do_poll = true, page_allocated; + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address; + + mask = swapin_nr_pages(offset) - 1; + if (!mask) + goto skip; + + do_poll = false; + /* Read a page_cluster sized and aligned cluster around offset. */ + start_offset = offset & ~mask; + end_offset = offset | mask; + if (!start_offset) /* First page is swap header. */ + start_offset++; + if (end_offset >= si->max) + end_offset = si->max - 1; + + blk_start_plug(&plug); + for (offset = start_offset; offset <= end_offset ; offset++) { + /* Ok, do the async read-ahead now */ + page = __read_swap_cache_async( + swp_entry(swp_type(entry), offset), + gfp_mask, vma, addr, &page_allocated); + if (!page) + continue; + if (page_allocated) { + swap_readpage(page, false, &splug); + if (offset != entry_offset) { + SetPageReadahead(page); + count_vm_event(SWAP_RA); + } + } + put_page(page); + } + blk_finish_plug(&plug); + swap_read_unplug(splug); + + lru_add_drain(); /* Push any new pages onto the LRU now */ +skip: + /* The page was likely read above, so no need for plugging here */ + return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL); +} + +int init_swap_address_space(unsigned int type, unsigned long nr_pages) +{ + struct address_space *spaces, *space; + unsigned int i, nr; + + nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL); + if (!spaces) + return -ENOMEM; + for (i = 0; i < nr; i++) { + space = spaces + i; + xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); + 
atomic_set(&space->i_mmap_writable, 0); + space->a_ops = &swap_aops; + /* swap cache doesn't use writeback related tags */ + mapping_set_no_writeback_tags(space); + } + nr_swapper_spaces[type] = nr; + swapper_spaces[type] = spaces; + + return 0; +} + +void exit_swap_address_space(unsigned int type) +{ + int i; + struct address_space *spaces = swapper_spaces[type]; + + for (i = 0; i < nr_swapper_spaces[type]; i++) + VM_WARN_ON_ONCE(!mapping_empty(&spaces[i])); + kvfree(spaces); + nr_swapper_spaces[type] = 0; + swapper_spaces[type] = NULL; +} + +static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, + unsigned long faddr, + unsigned long lpfn, + unsigned long rpfn, + unsigned long *start, + unsigned long *end) +{ + *start = max3(lpfn, PFN_DOWN(vma->vm_start), + PFN_DOWN(faddr & PMD_MASK)); + *end = min3(rpfn, PFN_DOWN(vma->vm_end), + PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); +} + +static void swap_ra_info(struct vm_fault *vmf, + struct vma_swap_readahead *ra_info) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long ra_val; + unsigned long faddr, pfn, fpfn; + unsigned long start, end; + pte_t *pte, *orig_pte; + unsigned int max_win, hits, prev_win, win, left; +#ifndef CONFIG_64BIT + pte_t *tpte; +#endif + + max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), + SWAP_RA_ORDER_CEILING); + if (max_win == 1) { + ra_info->win = 1; + return; + } + + faddr = vmf->address; + orig_pte = pte = pte_offset_map(vmf->pmd, faddr); + + fpfn = PFN_DOWN(faddr); + ra_val = GET_SWAP_RA_VAL(vma); + pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val)); + prev_win = SWAP_RA_WIN(ra_val); + hits = SWAP_RA_HITS(ra_val); + ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits, + max_win, prev_win); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(faddr, win, 0)); + + if (win == 1) { + pte_unmap(orig_pte); + return; + } + + /* Copy the PTEs because the page table may be unmapped */ + if (fpfn == pfn + 1) + swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end); + else 
if (pfn == fpfn + 1) + swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1, + &start, &end); + else { + left = (win - 1) / 2; + swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, + &start, &end); + } + ra_info->nr_pte = end - start; + ra_info->offset = fpfn - start; + pte -= ra_info->offset; +#ifdef CONFIG_64BIT + ra_info->ptes = pte; +#else + tpte = ra_info->ptes; + for (pfn = start; pfn != end; pfn++) + *tpte++ = *pte++; +#endif + pte_unmap(orig_pte); +} + +/** + * swap_vma_readahead - swap in pages in hope we need them soon + * @fentry: swap entry of this memory + * @gfp_mask: memory allocation flags + * @vmf: fault information + * + * Returns the struct page for entry and addr, after queueing swapin. + * + * Primitive swap readahead code. We simply read in a few pages whose + * virtual addresses are around the fault address in the same vma. + * + * Caller must hold read mmap_lock if vmf->vma is not NULL. + * + */ +static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, + struct vm_fault *vmf) +{ + struct blk_plug plug; + struct swap_iocb *splug = NULL; + struct vm_area_struct *vma = vmf->vma; + struct page *page; + pte_t *pte, pentry; + swp_entry_t entry; + unsigned int i; + bool page_allocated; + struct vma_swap_readahead ra_info = { + .win = 1, + }; + + swap_ra_info(vmf, &ra_info); + if (ra_info.win == 1) + goto skip; + + blk_start_plug(&plug); + for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte; + i++, pte++) { + pentry = *pte; + if (!is_swap_pte(pentry)) + continue; + entry = pte_to_swp_entry(pentry); + if (unlikely(non_swap_entry(entry))) + continue; + page = __read_swap_cache_async(entry, gfp_mask, vma, + vmf->address, &page_allocated); + if (!page) + continue; + if (page_allocated) { + swap_readpage(page, false, &splug); + if (i != ra_info.offset) { + SetPageReadahead(page); + count_vm_event(SWAP_RA); + } + } + put_page(page); + } + blk_finish_plug(&plug); + swap_read_unplug(splug); + lru_add_drain(); +skip: + /* The 
page was likely read above, so no need for plugging here */ + return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, + ra_info.win == 1, NULL); +} + +/** + * swapin_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @gfp_mask: memory allocation flags + * @vmf: fault information + * + * Returns the struct page for entry and addr, after queueing swapin. + * + * It's a main entry function for swap readahead. By the configuration, + * it will read ahead blocks by cluster-based(ie, physical disk based) + * or vma-based(ie, virtual address based on faulty address) readahead. + */ +struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct vm_fault *vmf) +{ + return swap_use_vma_readahead() ? + swap_vma_readahead(entry, gfp_mask, vmf) : + swap_cluster_readahead(entry, gfp_mask, vmf); +} + +#ifdef CONFIG_SYSFS +static ssize_t vma_ra_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", + enable_vma_readahead ? 
"true" : "false"); +} +static ssize_t vma_ra_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + + ret = kstrtobool(buf, &enable_vma_readahead); + if (ret) + return ret; + + return count; +} +static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled); + +static struct attribute *swap_attrs[] = { + &vma_ra_enabled_attr.attr, + NULL, +}; + +static const struct attribute_group swap_attr_group = { + .attrs = swap_attrs, +}; + +static int __init swap_init_sysfs(void) +{ + int err; + struct kobject *swap_kobj; + + swap_kobj = kobject_create_and_add("swap", mm_kobj); + if (!swap_kobj) { + pr_err("failed to create swap kobject\n"); + return -ENOMEM; + } + err = sysfs_create_group(swap_kobj, &swap_attr_group); + if (err) { + pr_err("failed to register swap group\n"); + goto delete_obj; + } + return 0; + +delete_obj: + kobject_put(swap_kobj); + return err; +} +subsys_initcall(swap_init_sysfs); +#endif diff --git a/mm/swapfile.c b/mm/swapfile.c new file mode 100644 index 000000000..71db6d8a1 --- /dev/null +++ b/mm/swapfile.c @@ -0,0 +1,3686 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/mm/swapfile.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * Swap reorganised 29.12.95, Stephen Tweedie + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "swap.h" + +static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); +static void free_swap_count_continuations(struct swap_info_struct *); + +static DEFINE_SPINLOCK(swap_lock); +static unsigned int nr_swapfiles; +atomic_long_t nr_swap_pages; +/* + * Some 
modules use swappable objects and may try to swap them out under + * memory pressure (via the shrinker). Before doing so, they may wish to + * check to see if any swap space is available. + */ +EXPORT_SYMBOL_GPL(nr_swap_pages); +/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ +long total_swap_pages; +static int least_priority = -1; +unsigned long swapfile_maximum_size; +#ifdef CONFIG_MIGRATION +bool swap_migration_ad_supported; +#endif /* CONFIG_MIGRATION */ + +static const char Bad_file[] = "Bad swap file entry "; +static const char Unused_file[] = "Unused swap file entry "; +static const char Bad_offset[] = "Bad swap offset entry "; +static const char Unused_offset[] = "Unused swap offset entry "; + +/* + * all active swap_info_structs + * protected with swap_lock, and ordered by priority. + */ +static PLIST_HEAD(swap_active_head); + +/* + * all available (active, not full) swap_info_structs + * protected with swap_avail_lock, ordered by priority. + * This is used by folio_alloc_swap() instead of swap_active_head + * because swap_active_head includes all swap_info_structs, + * but folio_alloc_swap() doesn't need to look at full ones. + * This uses its own lock instead of swap_lock because when a + * swap_info_struct changes between not-full/full, it needs to + * add/remove itself to/from this list, but the swap_info_struct->lock + * is held and the locking order requires swap_lock to be taken + * before any swap_info_struct->lock. 
+ */ +static struct plist_head *swap_avail_heads; +static DEFINE_SPINLOCK(swap_avail_lock); + +struct swap_info_struct *swap_info[MAX_SWAPFILES]; + +static DEFINE_MUTEX(swapon_mutex); + +static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); +/* Activity counter to indicate that a swapon or swapoff has occurred */ +static atomic_t proc_poll_event = ATOMIC_INIT(0); + +atomic_t nr_rotate_swap = ATOMIC_INIT(0); + +static struct swap_info_struct *swap_type_to_swap_info(int type) +{ + if (type >= MAX_SWAPFILES) + return NULL; + + return READ_ONCE(swap_info[type]); /* rcu_dereference() */ +} + +static inline unsigned char swap_count(unsigned char ent) +{ + return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ +} + +/* Reclaim the swap entry anyway if possible */ +#define TTRS_ANYWAY 0x1 +/* + * Reclaim the swap entry if there are no more mappings of the + * corresponding page + */ +#define TTRS_UNMAPPED 0x2 +/* Reclaim the swap entry if swap is getting full*/ +#define TTRS_FULL 0x4 + +/* returns 1 if swap entry is freed */ +static int __try_to_reclaim_swap(struct swap_info_struct *si, + unsigned long offset, unsigned long flags) +{ + swp_entry_t entry = swp_entry(si->type, offset); + struct folio *folio; + int ret = 0; + + folio = filemap_get_folio(swap_address_space(entry), offset); + if (!folio) + return 0; + /* + * When this function is called from scan_swap_map_slots() and it's + * called by vmscan.c at reclaiming folios. So we hold a folio lock + * here. We have to use trylock for avoiding deadlock. This is a special + * case and you should use folio_free_swap() with explicit folio_lock() + * in usual operations. 
+ */ + if (folio_trylock(folio)) { + if ((flags & TTRS_ANYWAY) || + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) + ret = folio_free_swap(folio); + folio_unlock(folio); + } + folio_put(folio); + return ret; +} + +static inline struct swap_extent *first_se(struct swap_info_struct *sis) +{ + struct rb_node *rb = rb_first(&sis->swap_extent_root); + return rb_entry(rb, struct swap_extent, rb_node); +} + +static inline struct swap_extent *next_se(struct swap_extent *se) +{ + struct rb_node *rb = rb_next(&se->rb_node); + return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL; +} + +/* + * swapon tell device that all the old swap contents can be discarded, + * to allow the swap device to optimize its wear-levelling. + */ +static int discard_swap(struct swap_info_struct *si) +{ + struct swap_extent *se; + sector_t start_block; + sector_t nr_blocks; + int err = 0; + + /* Do not discard the swap header page! */ + se = first_se(si); + start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); + nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); + if (nr_blocks) { + err = blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_KERNEL); + if (err) + return err; + cond_resched(); + } + + for (se = next_se(se); se; se = next_se(se)) { + start_block = se->start_block << (PAGE_SHIFT - 9); + nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); + + err = blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_KERNEL); + if (err) + break; + + cond_resched(); + } + return err; /* That will often be -EOPNOTSUPP */ +} + +static struct swap_extent * +offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) +{ + struct swap_extent *se; + struct rb_node *rb; + + rb = sis->swap_extent_root.rb_node; + while (rb) { + se = rb_entry(rb, struct swap_extent, rb_node); + if (offset < se->start_page) + rb = rb->rb_left; + else if (offset >= se->start_page + se->nr_pages) + rb = rb->rb_right; + 
else + return se; + } + /* It *must* be present */ + BUG(); +} + +sector_t swap_page_sector(struct page *page) +{ + struct swap_info_struct *sis = page_swap_info(page); + struct swap_extent *se; + sector_t sector; + pgoff_t offset; + + offset = __page_file_index(page); + se = offset_to_swap_extent(sis, offset); + sector = se->start_block + (offset - se->start_page); + return sector << (PAGE_SHIFT - 9); +} + +/* + * swap allocation tell device that a cluster of swap can now be discarded, + * to allow the swap device to optimize its wear-levelling. + */ +static void discard_swap_cluster(struct swap_info_struct *si, + pgoff_t start_page, pgoff_t nr_pages) +{ + struct swap_extent *se = offset_to_swap_extent(si, start_page); + + while (nr_pages) { + pgoff_t offset = start_page - se->start_page; + sector_t start_block = se->start_block + offset; + sector_t nr_blocks = se->nr_pages - offset; + + if (nr_blocks > nr_pages) + nr_blocks = nr_pages; + start_page += nr_blocks; + nr_pages -= nr_blocks; + + start_block <<= PAGE_SHIFT - 9; + nr_blocks <<= PAGE_SHIFT - 9; + if (blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_NOIO)) + break; + + se = next_se(se); + } +} + +#ifdef CONFIG_THP_SWAP +#define SWAPFILE_CLUSTER HPAGE_PMD_NR + +#define swap_entry_size(size) (size) +#else +#define SWAPFILE_CLUSTER 256 + +/* + * Define swap_entry_size() as constant to let compiler to optimize + * out some code if !CONFIG_THP_SWAP + */ +#define swap_entry_size(size) 1 +#endif +#define LATENCY_LIMIT 256 + +static inline void cluster_set_flag(struct swap_cluster_info *info, + unsigned int flag) +{ + info->flags = flag; +} + +static inline unsigned int cluster_count(struct swap_cluster_info *info) +{ + return info->data; +} + +static inline void cluster_set_count(struct swap_cluster_info *info, + unsigned int c) +{ + info->data = c; +} + +static inline void cluster_set_count_flag(struct swap_cluster_info *info, + unsigned int c, unsigned int f) +{ + info->flags = f; + info->data = c; 
+} + +static inline unsigned int cluster_next(struct swap_cluster_info *info) +{ + return info->data; +} + +static inline void cluster_set_next(struct swap_cluster_info *info, + unsigned int n) +{ + info->data = n; +} + +static inline void cluster_set_next_flag(struct swap_cluster_info *info, + unsigned int n, unsigned int f) +{ + info->flags = f; + info->data = n; +} + +static inline bool cluster_is_free(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_FREE; +} + +static inline bool cluster_is_null(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_NEXT_NULL; +} + +static inline void cluster_set_null(struct swap_cluster_info *info) +{ + info->flags = CLUSTER_FLAG_NEXT_NULL; + info->data = 0; +} + +static inline bool cluster_is_huge(struct swap_cluster_info *info) +{ + if (IS_ENABLED(CONFIG_THP_SWAP)) + return info->flags & CLUSTER_FLAG_HUGE; + return false; +} + +static inline void cluster_clear_huge(struct swap_cluster_info *info) +{ + info->flags &= ~CLUSTER_FLAG_HUGE; +} + +static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + unsigned long offset) +{ + struct swap_cluster_info *ci; + + ci = si->cluster_info; + if (ci) { + ci += offset / SWAPFILE_CLUSTER; + spin_lock(&ci->lock); + } + return ci; +} + +static inline void unlock_cluster(struct swap_cluster_info *ci) +{ + if (ci) + spin_unlock(&ci->lock); +} + +/* + * Determine the locking method in use for this device. Return + * swap_cluster_info if SSD-style cluster-based locking is in place. 
+ */ +static inline struct swap_cluster_info *lock_cluster_or_swap_info( + struct swap_info_struct *si, unsigned long offset) +{ + struct swap_cluster_info *ci; + + /* Try to use fine-grained SSD-style locking if available: */ + ci = lock_cluster(si, offset); + /* Otherwise, fall back to traditional, coarse locking: */ + if (!ci) + spin_lock(&si->lock); + + return ci; +} + +static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + if (ci) + unlock_cluster(ci); + else + spin_unlock(&si->lock); +} + +static inline bool cluster_list_empty(struct swap_cluster_list *list) +{ + return cluster_is_null(&list->head); +} + +static inline unsigned int cluster_list_first(struct swap_cluster_list *list) +{ + return cluster_next(&list->head); +} + +static void cluster_list_init(struct swap_cluster_list *list) +{ + cluster_set_null(&list->head); + cluster_set_null(&list->tail); +} + +static void cluster_list_add_tail(struct swap_cluster_list *list, + struct swap_cluster_info *ci, + unsigned int idx) +{ + if (cluster_list_empty(list)) { + cluster_set_next_flag(&list->head, idx, 0); + cluster_set_next_flag(&list->tail, idx, 0); + } else { + struct swap_cluster_info *ci_tail; + unsigned int tail = cluster_next(&list->tail); + + /* + * Nested cluster lock, but both cluster locks are + * only acquired when we held swap_info_struct->lock + */ + ci_tail = ci + tail; + spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); + cluster_set_next(ci_tail, idx); + spin_unlock(&ci_tail->lock); + cluster_set_next_flag(&list->tail, idx, 0); + } +} + +static unsigned int cluster_list_del_first(struct swap_cluster_list *list, + struct swap_cluster_info *ci) +{ + unsigned int idx; + + idx = cluster_next(&list->head); + if (cluster_next(&list->tail) == idx) { + cluster_set_null(&list->head); + cluster_set_null(&list->tail); + } else + cluster_set_next_flag(&list->head, + cluster_next(&ci[idx]), 0); + + return idx; +} + +/* Add a cluster to 
discard list and schedule it to do discard */
+static void swap_cluster_schedule_discard(struct swap_info_struct *si,
+		unsigned int idx)
+{
+	/*
+	 * If scan_swap_map_slots() can't find a free cluster, it will check
+	 * si->swap_map directly. To make sure the discarding cluster isn't
+	 * taken by scan_swap_map_slots(), mark the swap entries bad (occupied).
+	 * The marking is cleared again after the discard, see
+	 * swap_do_scheduled_discard().
+	 */
+	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+			SWAP_MAP_BAD, SWAPFILE_CLUSTER);
+
+	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
+
+	schedule_work(&si->discard_work);
+}
+
+/*
+ * Flag cluster @idx as CLUSTER_FLAG_FREE and append it to the tail of
+ * si->free_clusters.
+ */
+static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+	struct swap_cluster_info *ci = si->cluster_info;
+
+	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
+	cluster_list_add_tail(&si->free_clusters, ci, idx);
+}
+
+/*
+ * Actually perform the pending discards. After a cluster discard has
+ * finished, the cluster is added to the free cluster list. The caller must
+ * hold si->lock; the lock is dropped around each discard_swap_cluster() call
+ * and retaken afterwards.
+*/
+static void swap_do_scheduled_discard(struct swap_info_struct *si)
+{
+	struct swap_cluster_info *info, *ci;
+	unsigned int idx;
+
+	info = si->cluster_info;
+
+	while (!cluster_list_empty(&si->discard_clusters)) {
+		idx = cluster_list_del_first(&si->discard_clusters, info);
+		spin_unlock(&si->lock);
+
+		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
+				SWAPFILE_CLUSTER);
+
+		spin_lock(&si->lock);
+		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
+		__free_cluster(si, idx);
+		/* undo the SWAP_MAP_BAD marking set when the discard was queued */
+		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+				0, SWAPFILE_CLUSTER);
+		unlock_cluster(ci);
+	}
+}
+
+/*
+ * Work handler for si->discard_work: recover the owning swap_info_struct
+ * and run all pending discards with si->lock held.
+ */
+static void swap_discard_work(struct work_struct *work)
+{
+	struct swap_info_struct *si;
+
+	si = container_of(work, struct swap_info_struct, discard_work);
+
+	spin_lock(&si->lock);
+	swap_do_scheduled_discard(si);
+	spin_unlock(&si->lock);
+}
+
+/*
+ * Callback taking the percpu_ref embedded in a swap_info_struct as ->users;
+ * it signals the si->comp completion.
+ * NOTE(review): presumably registered as the release callback of si->users;
+ * the registration is not visible here - confirm.
+ */
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+	struct swap_info_struct *si;
+
+	si = container_of(ref, struct swap_info_struct, users);
+	complete(&si->comp);
+}
+
+/*
+ * Take cluster @idx off the free cluster list and reset its count and flags
+ * to 0. @idx must be the first entry of si->free_clusters.
+ */
+static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+	struct swap_cluster_info *ci = si->cluster_info;
+
+	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
+	cluster_list_del_first(&si->free_clusters, ci);
+	cluster_set_count_flag(ci + idx, 0, 0);
+}
+
+/*
+ * Release cluster @idx, whose usage count must already be 0: either queue
+ * it for discard or put it straight on the free cluster list.
+ */
+static void free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+	struct swap_cluster_info *ci = si->cluster_info + idx;
+
+	VM_BUG_ON(cluster_count(ci) != 0);
+	/*
+	 * If the swap device is writable and discards pages, schedule the
+	 * cluster for discard instead of freeing it immediately. The
+	 * cluster will be freed after the discard.
+	 */
+	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
+	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
+		swap_cluster_schedule_discard(si, idx);
+		return;
+	}
+
+	__free_cluster(si, idx);
+}
+
+/*
+ * The cluster corresponding to page_nr will be used. If the cluster is
+ * currently free it is removed from the free cluster list; its usage
+ * counter is then increased by one. Does nothing when @cluster_info is NULL.
+ */
+static void inc_cluster_info_page(struct swap_info_struct *p,
+	struct swap_cluster_info *cluster_info, unsigned long page_nr)
+{
+	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+
+	if (!cluster_info)
+		return;
+	if (cluster_is_free(&cluster_info[idx]))
+		alloc_cluster(p, idx);
+
+	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
+	cluster_set_count(&cluster_info[idx],
+		cluster_count(&cluster_info[idx]) + 1);
+}
+
+/*
+ * The cluster corresponding to page_nr decreases one usage. If the usage
+ * counter becomes 0, which means no page in the cluster is in use, we can
+ * optionally discard the cluster and add it to free cluster list.
+ */
+static void dec_cluster_info_page(struct swap_info_struct *p,
+	struct swap_cluster_info *cluster_info, unsigned long page_nr)
+{
+	unsigned long cluster_idx = page_nr / SWAPFILE_CLUSTER;
+	struct swap_cluster_info *ci;
+
+	/* No cluster bookkeeping on this device: nothing to account. */
+	if (!cluster_info)
+		return;
+
+	ci = &cluster_info[cluster_idx];
+
+	VM_BUG_ON(cluster_count(ci) == 0);
+	cluster_set_count(ci, cluster_count(ci) - 1);
+
+	/* Last user gone: hand the cluster back (possibly via discard). */
+	if (!cluster_count(ci))
+		free_cluster(p, cluster_idx);
+}
+
+/*
+ * scan_swap_map_slots() may be about to use a cluster that is free but sits
+ * in the middle of the free cluster list rather than at its head. Using it
+ * would corrupt the list, so report the conflict and drop this CPU's
+ * current cluster so that a new one gets picked.
+ */
+static bool
+scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
+	unsigned long offset)
+{
+	unsigned long cluster_idx = offset / SWAPFILE_CLUSTER;
+	struct percpu_cluster *pcp_cluster;
+
+	if (cluster_list_empty(&si->free_clusters))
+		return false;
+	if (cluster_idx == cluster_list_first(&si->free_clusters))
+		return false;
+	if (!cluster_is_free(&si->cluster_info[cluster_idx]))
+		return false;
+
+	pcp_cluster = this_cpu_ptr(si->percpu_cluster);
+	cluster_set_null(&pcp_cluster->index);
+	return true;
+}
+
+/*
+ * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
+ * might involve allocating a new cluster for current CPU too.
+ */
+static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+	unsigned long *offset, unsigned long *scan_base)
+{
+	struct percpu_cluster *cluster;
+	struct swap_cluster_info *ci;
+	unsigned long tmp, max;
+
+new_cluster:
+	cluster = this_cpu_ptr(si->percpu_cluster);
+	if (cluster_is_null(&cluster->index)) {
+		if (!cluster_list_empty(&si->free_clusters)) {
+			cluster->index = si->free_clusters.head;
+			cluster->next = cluster_next(&cluster->index) *
+					SWAPFILE_CLUSTER;
+		} else if (!cluster_list_empty(&si->discard_clusters)) {
+			/*
+			 * we don't have free cluster but have some clusters in
+			 * discarding, do discard now and reclaim them, then
+			 * reread cluster_next_cpu since we dropped si->lock
+			 */
+			swap_do_scheduled_discard(si);
+			*scan_base = this_cpu_read(*si->cluster_next_cpu);
+			*offset = *scan_base;
+			goto new_cluster;
+		} else
+			return false;
+	}
+
+	/*
+	 * Other CPUs can use our cluster if they can't find a free cluster,
+	 * check if there is still free entry in the cluster
+	 */
+	tmp = cluster->next;
+	max = min_t(unsigned long, si->max,
+		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
+	if (tmp < max) {
+		ci = lock_cluster(si, tmp);
+		while (tmp < max) {
+			if (!si->swap_map[tmp])
+				break;
+			tmp++;
+		}
+		unlock_cluster(ci);
+	}
+	/* cluster exhausted: forget it and pick another one */
+	if (tmp >= max) {
+		cluster_set_null(&cluster->index);
+		goto new_cluster;
+	}
+	cluster->next = tmp + 1;
+	*offset = tmp;
+	*scan_base = tmp;
+	return true;
+}
+
+/*
+ * Remove @p from the available-list of every node. Asserts that p->lock is
+ * held; the only caller visible here also holds swap_avail_lock.
+ */
+static void __del_from_avail_list(struct swap_info_struct *p)
+{
+	int nid;
+
+	assert_spin_locked(&p->lock);
+	for_each_node(nid)
+		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
+}
+
+/* Locked wrapper: takes swap_avail_lock around __del_from_avail_list(). */
+static void del_from_avail_list(struct swap_info_struct *p)
+{
+	spin_lock(&swap_avail_lock);
+	__del_from_avail_list(p);
+	spin_unlock(&swap_avail_lock);
+}
+
+/*
+ * Account @nr_entries newly allocated slots starting at @offset: adjust the
+ * lowest_bit/highest_bit hints and inuse_pages, and take the device off the
+ * available lists once every page is in use.
+ */
+static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
+			     unsigned int nr_entries)
+{
+	unsigned int end = offset + nr_entries - 1;
+
+	if (offset == si->lowest_bit)
+		si->lowest_bit += nr_entries;
+	if (end == si->highest_bit)
+		WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
+	WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
+	if (si->inuse_pages == si->pages) {
+		/* device is full: lowest_bit = max, highest_bit = 0 */
+		si->lowest_bit = si->max;
+		si->highest_bit = 0;
+		del_from_avail_list(si);
+	}
+}
+
+/*
+ * Add @p to the available-list of every node under swap_avail_lock; warns
+ * if @p is already on one of the lists.
+ */
+static void add_to_avail_list(struct swap_info_struct *p)
+{
+	int nid;
+
+	spin_lock(&swap_avail_lock);
+	for_each_node(nid) {
+		WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
+		plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
+	}
+	spin_unlock(&swap_avail_lock);
+}
+
+/*
+ * Account @nr_entries freed slots starting at @offset: widen the
+ * lowest_bit/highest_bit hints, update nr_swap_pages and inuse_pages, and
+ * invalidate each freed slot (arch, frontswap, block driver notification,
+ * swap cache shadow entries).
+ */
+static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
+			    unsigned int nr_entries)
+{
+	unsigned long begin = offset;
+	unsigned long end = offset + nr_entries - 1;
+	void (*swap_slot_free_notify)(struct block_device *, unsigned long);
+
+	if (offset < si->lowest_bit)
+		si->lowest_bit = offset;
+	if (end > si->highest_bit) {
+		/* highest_bit == 0 is what swap_range_alloc() sets when full */
+		bool was_full = !si->highest_bit;
+
+		WRITE_ONCE(si->highest_bit, end);
+		if (was_full && (si->flags & SWP_WRITEOK))
+			add_to_avail_list(si);
+	}
+	atomic_long_add(nr_entries, &nr_swap_pages);
+	WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
+	if (si->flags & SWP_BLKDEV)
+		swap_slot_free_notify =
+			si->bdev->bd_disk->fops->swap_slot_free_notify;
+	else
+		swap_slot_free_notify = NULL;
+	while (offset <= end) {
+		arch_swap_invalidate_page(si->type, offset);
+		frontswap_invalidate_page(si->type, offset);
+		if (swap_slot_free_notify)
+			swap_slot_free_notify(si->bdev, offset);
+		offset++;
+	}
+	clear_shadow_from_swap_cache(si->type, begin, end);
+}
+
+/*
+ * Record where the next scan should start: si->cluster_next for devices
+ * without SWP_SOLIDSTATE, the per-cpu cluster_next_cpu otherwise.
+ */
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+{
+	unsigned long prev;
+
+	if (!(si->flags & SWP_SOLIDSTATE)) {
+		si->cluster_next = next;
+		return;
+	}
+
+	prev = this_cpu_read(*si->cluster_next_cpu);
+	/*
+	 * Cross the swap address space size aligned trunk, choose
+	 * another trunk randomly to avoid lock contention on swap
+	 * address space if possible.
+	 */
+	if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
+	    (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
+		/* No free swap slots available */
+		if (si->highest_bit <= si->lowest_bit)
+			return;
+		next = si->lowest_bit +
+			prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+		next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
+		next = max_t(unsigned int, next, si->lowest_bit);
+	}
+	this_cpu_write(*si->cluster_next_cpu, next);
+}
+
+/*
+ * Lockless peek at slot @offset. Returns true WITH si->lock taken when the
+ * slot looks free, or looks cache-only while swap is getting full;
+ * returns false without taking the lock otherwise. The caller must
+ * re-check the slot under the lock.
+ */
+static bool swap_offset_available_and_locked(struct swap_info_struct *si,
+					     unsigned long offset)
+{
+	if (data_race(!si->swap_map[offset])) {
+		spin_lock(&si->lock);
+		return true;
+	}
+
+	if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
+		spin_lock(&si->lock);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Allocate up to @nr swap slots on @si, marking each with @usage in
+ * si->swap_map and storing the entries in @slots. Returns the number of
+ * slots obtained (possibly 0). Every caller visible here holds si->lock on
+ * entry; the function drops and retakes it while scanning and returns with
+ * it held. SWP_SCANNING is added to si->flags for the duration of the call.
+ */
+static int scan_swap_map_slots(struct swap_info_struct *si,
+			       unsigned char usage, int nr,
+			       swp_entry_t slots[])
+{
+	struct swap_cluster_info *ci;
+	unsigned long offset;
+	unsigned long scan_base;
+	unsigned long last_in_cluster = 0;
+	int latency_ration = LATENCY_LIMIT;
+	int n_ret = 0;
+	bool scanned_many = false;
+
+	/*
+	 * We try to cluster swap pages by allocating them sequentially
+	 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
+	 * way, however, we resort to first-free allocation, starting
+	 * a new cluster. This prevents us from scattering swap pages
+	 * all over the entire swap partition, so that we reduce
+	 * overall disk seek times between swap pages. -- sct
+	 * But we do now try to find an empty cluster. -Andrea
+	 * And we let swap pages go all over an SSD partition. Hugh
+	 */
+
+	si->flags += SWP_SCANNING;
+	/*
+	 * Use percpu scan base for SSD to reduce lock contention on
+	 * cluster and swap cache. For HDD, sequential access is more
+	 * important.
+	 */
+	if (si->flags & SWP_SOLIDSTATE)
+		scan_base = this_cpu_read(*si->cluster_next_cpu);
+	else
+		scan_base = si->cluster_next;
+	offset = scan_base;
+
+	/* SSD algorithm */
+	if (si->cluster_info) {
+		if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+			goto scan;
+	} else if (unlikely(!si->cluster_nr--)) {
+		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
+			si->cluster_nr = SWAPFILE_CLUSTER - 1;
+			goto checks;
+		}
+
+		spin_unlock(&si->lock);
+
+		/*
+		 * If seek is expensive, start searching for new cluster from
+		 * start of partition, to minimize the span of allocated swap.
+		 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
+		 * case, just handled by scan_swap_map_try_ssd_cluster() above.
+		 */
+		scan_base = offset = si->lowest_bit;
+		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
+
+		/* Locate the first empty (unaligned) cluster */
+		for (; last_in_cluster <= si->highest_bit; offset++) {
+			if (si->swap_map[offset])
+				last_in_cluster = offset + SWAPFILE_CLUSTER;
+			else if (offset == last_in_cluster) {
+				spin_lock(&si->lock);
+				offset -= SWAPFILE_CLUSTER - 1;
+				si->cluster_next = offset;
+				si->cluster_nr = SWAPFILE_CLUSTER - 1;
+				goto checks;
+			}
+			if (unlikely(--latency_ration < 0)) {
+				cond_resched();
+				latency_ration = LATENCY_LIMIT;
+			}
+		}
+
+		offset = scan_base;
+		spin_lock(&si->lock);
+		si->cluster_nr = SWAPFILE_CLUSTER - 1;
+	}
+
+checks:
+	/* si->lock is held on every path that reaches this label */
+	if (si->cluster_info) {
+		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
+		/* take a break if we already got some slots */
+			if (n_ret)
+				goto done;
+			if (!scan_swap_map_try_ssd_cluster(si, &offset,
+							&scan_base))
+				goto scan;
+		}
+	}
+	if (!(si->flags & SWP_WRITEOK))
+		goto no_page;
+	if (!si->highest_bit)
+		goto no_page;
+	if (offset > si->highest_bit)
+		scan_base = offset = si->lowest_bit;
+
+	ci = lock_cluster(si, offset);
+	/* reuse swap entry of cache-only swap if not busy. */
+	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+		int swap_was_freed;
+		unlock_cluster(ci);
+		spin_unlock(&si->lock);
+		swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
+		spin_lock(&si->lock);
+		/* entry was freed successfully, try to use this again */
+		if (swap_was_freed)
+			goto checks;
+		goto scan; /* check next one */
+	}
+
+	if (si->swap_map[offset]) {
+		unlock_cluster(ci);
+		if (!n_ret)
+			goto scan;
+		else
+			goto done;
+	}
+	/* the slot is free: claim it */
+	WRITE_ONCE(si->swap_map[offset], usage);
+	inc_cluster_info_page(si, si->cluster_info, offset);
+	unlock_cluster(ci);
+
+	swap_range_alloc(si, offset, 1);
+	slots[n_ret++] = swp_entry(si->type, offset);
+
+	/* got enough slots or reach max slots? */
+	if ((n_ret == nr) || (offset >= si->highest_bit))
+		goto done;
+
+	/* search for next available slot */
+
+	/* time to take a break? */
+	if (unlikely(--latency_ration < 0)) {
+		if (n_ret)
+			goto done;
+		spin_unlock(&si->lock);
+		cond_resched();
+		spin_lock(&si->lock);
+		latency_ration = LATENCY_LIMIT;
+	}
+
+	/* try to get more slots in cluster */
+	if (si->cluster_info) {
+		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+			goto checks;
+	} else if (si->cluster_nr && !si->swap_map[++offset]) {
+		/* non-ssd case, still more slots in cluster? */
+		--si->cluster_nr;
+		goto checks;
+	}
+
+	/*
+	 * Even if there's no free clusters available (fragmented),
+	 * try to scan a little more quickly with lock held unless we
+	 * have scanned too many slots already.
+	 */
+	if (!scanned_many) {
+		unsigned long scan_limit;
+
+		if (offset < scan_base)
+			scan_limit = scan_base;
+		else
+			scan_limit = si->highest_bit;
+		for (; offset <= scan_limit && --latency_ration > 0;
+		     offset++) {
+			if (!si->swap_map[offset])
+				goto checks;
+		}
+	}
+
+done:
+	set_cluster_next(si, offset + 1);
+	si->flags -= SWP_SCANNING;
+	return n_ret;
+
+scan:
+	/*
+	 * Lockless scan: first from offset up to highest_bit, then from
+	 * lowest_bit up to scan_base. swap_offset_available_and_locked()
+	 * retakes si->lock before we jump back to checks.
+	 */
+	spin_unlock(&si->lock);
+	while (++offset <= READ_ONCE(si->highest_bit)) {
+		if (unlikely(--latency_ration < 0)) {
+			cond_resched();
+			latency_ration = LATENCY_LIMIT;
+			scanned_many = true;
+		}
+		if (swap_offset_available_and_locked(si, offset))
+			goto checks;
+	}
+	offset = si->lowest_bit;
+	while (offset < scan_base) {
+		if (unlikely(--latency_ration < 0)) {
+			cond_resched();
+			latency_ration = LATENCY_LIMIT;
+			scanned_many = true;
+		}
+		if (swap_offset_available_and_locked(si, offset))
+			goto checks;
+		offset++;
+	}
+	spin_lock(&si->lock);
+
+no_page:
+	si->flags -= SWP_SCANNING;
+	return n_ret;
+}
+
+/*
+ * Allocate one whole free cluster as a huge swap entry: every slot of the
+ * cluster is set to SWAP_HAS_CACHE and the cluster is flagged
+ * CLUSTER_FLAG_HUGE. Stores the entry in @slot and returns 1, or returns 0
+ * when no free cluster is available or CONFIG_THP_SWAP is disabled.
+ */
+static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
+{
+	unsigned long idx;
+	struct swap_cluster_info *ci;
+	unsigned long offset;
+
+	/*
+	 * Should not even be attempting cluster allocations when huge
+	 * page swap is disabled. Warn and fail the allocation.
+	 */
+	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+		VM_WARN_ON_ONCE(1);
+		return 0;
+	}
+
+	if (cluster_list_empty(&si->free_clusters))
+		return 0;
+
+	idx = cluster_list_first(&si->free_clusters);
+	offset = idx * SWAPFILE_CLUSTER;
+	ci = lock_cluster(si, offset);
+	alloc_cluster(si, idx);
+	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
+
+	memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
+	unlock_cluster(ci);
+	swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
+	*slot = swp_entry(si->type, offset);
+
+	return 1;
+}
+
+/*
+ * Free every slot of cluster @idx at once: clear its swap_map bytes, reset
+ * the cluster count/flags, release the cluster and account the freed range.
+ */
+static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
+{
+	unsigned long offset = idx * SWAPFILE_CLUSTER;
+	struct swap_cluster_info *ci;
+
+	ci = lock_cluster(si, offset);
+	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
+	cluster_set_count_flag(ci, 0, 0);
+	free_cluster(si, idx);
+	unlock_cluster(ci);
+	swap_range_free(si, offset, SWAPFILE_CLUSTER);
+}
+
+/*
+ * Allocate up to @n_goal swap entries of @entry_size pages each into
+ * @swp_entries, walking the current node's list of available swap devices.
+ * nr_swap_pages is debited up front for the (clamped) goal and credited
+ * back for whatever could not be allocated. Returns the number of entries
+ * obtained.
+ */
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
+{
+	unsigned long size = swap_entry_size(entry_size);
+	struct swap_info_struct *si, *next;
+	long avail_pgs;
+	int n_ret = 0;
+	int node;
+
+	/* Only single cluster request supported */
+	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
+
+	spin_lock(&swap_avail_lock);
+
+	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
+	if (avail_pgs <= 0) {
+		spin_unlock(&swap_avail_lock);
+		goto noswap;
+	}
+
+	n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
+
+	atomic_long_sub(n_goal * size, &nr_swap_pages);
+
+start_over:
+	node = numa_node_id();
+	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
+		/* requeue si to after same-priority siblings */
+		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
+		spin_unlock(&swap_avail_lock);
+		spin_lock(&si->lock);
+		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
+			spin_lock(&swap_avail_lock);
+			/* someone else already took si off the list */
+			if (plist_node_empty(&si->avail_lists[node])) {
+				spin_unlock(&si->lock);
+				goto nextsi;
+			}
+			WARN(!si->highest_bit,
+			     "swap_info %d in list but !highest_bit\n",
+			     si->type);
+			WARN(!(si->flags & SWP_WRITEOK),
+			     "swap_info %d in list but !SWP_WRITEOK\n",
+			     si->type);
+			__del_from_avail_list(si);
+			spin_unlock(&si->lock);
+			goto nextsi;
+		}
+		if (size == SWAPFILE_CLUSTER) {
+			if (si->flags & SWP_BLKDEV)
+				n_ret = swap_alloc_cluster(si, swp_entries);
+		} else
+			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+						    n_goal, swp_entries);
+		spin_unlock(&si->lock);
+		if (n_ret || size == SWAPFILE_CLUSTER)
+			goto check_out;
+		pr_debug("scan_swap_map of si %d failed to find offset\n",
+			si->type);
+		cond_resched();
+
+		spin_lock(&swap_avail_lock);
+nextsi:
+		/*
+		 * if we got here, it's likely that si was almost full before,
+		 * and since scan_swap_map_slots() can drop the si->lock,
+		 * multiple callers probably all tried to get a page from the
+		 * same si and it filled up before we could get one; or, the si
+		 * filled up between us dropping swap_avail_lock and taking
+		 * si->lock. Since we dropped the swap_avail_lock, the
+		 * swap_avail_head list may have been modified; so if next is
+		 * still in the swap_avail_head list then try it, otherwise
+		 * start over if we have not gotten any slots.
+		 */
+		if (plist_node_empty(&next->avail_lists[node]))
+			goto start_over;
+	}
+
+	spin_unlock(&swap_avail_lock);
+
+check_out:
+	/* give back the part of the up-front debit that was not used */
+	if (n_ret < n_goal)
+		atomic_long_add((long)(n_goal - n_ret) * size,
+				&nr_swap_pages);
+noswap:
+	return n_ret;
+}
+
+/*
+ * Validate @entry and return its swap_info_struct, or NULL (after logging
+ * an error, except for a zero entry) when the device is missing or unused,
+ * the offset is out of range, or the slot is free. No lock or reference is
+ * taken on the returned device.
+ */
+static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
+{
+	struct swap_info_struct *p;
+	unsigned long offset;
+
+	if (!entry.val)
+		goto out;
+	p = swp_swap_info(entry);
+	if (!p)
+		goto bad_nofile;
+	if (data_race(!(p->flags & SWP_USED)))
+		goto bad_device;
+	offset = swp_offset(entry);
+	if (offset >= p->max)
+		goto bad_offset;
+	if (data_race(!p->swap_map[swp_offset(entry)]))
+		goto bad_free;
+	return p;
+
+bad_free:
+	pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
+	goto out;
+bad_offset:
+	pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
+	goto out;
+bad_device:
+	pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val);
+	goto out;
+bad_nofile:
+	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+	return NULL;
+}
+
+/*
+ * Like _swap_info_get(), but additionally hands over the device lock: if
+ * the device of @entry differs from @q, q->lock is released (when @q is
+ * not NULL) and the new device's lock is taken (when it is not NULL).
+ */
+static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
+					struct swap_info_struct *q)
+{
+	struct swap_info_struct *p;
+
+	p = _swap_info_get(entry);
+
+	if (p != q) {
+		if (q != NULL)
+			spin_unlock(&q->lock);
+		if (p != NULL)
+			spin_lock(&p->lock);
+	}
+	return p;
+}
+
+/*
+ * Drop one reference of kind @usage from slot @offset: SWAP_HAS_CACHE
+ * clears the cache flag, anything else decrements the map count. Returns
+ * the resulting swap_map value. When that value is 0 the slot is left at
+ * SWAP_HAS_CACHE rather than 0 and 0 is returned; the callers visible here
+ * then pass the entry to free_swap_slot().
+ */
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+					      unsigned long offset,
+					      unsigned char usage)
+{
+	unsigned char count;
+	unsigned char has_cache;
+
+	count = p->swap_map[offset];
+
+	has_cache = count & SWAP_HAS_CACHE;
+	count &= ~SWAP_HAS_CACHE;
+
+	if (usage == SWAP_HAS_CACHE) {
+		VM_BUG_ON(!has_cache);
+		has_cache = 0;
+	} else if (count == SWAP_MAP_SHMEM) {
+		/*
+		 * Or we could insist on shmem.c using a special
+		 * swap_shmem_free() and free_shmem_swap_and_cache()...
+		 */
+		count = 0;
+	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+		if (count == COUNT_CONTINUED) {
+			if (swap_count_continued(p, offset, count))
+				count = SWAP_MAP_MAX | COUNT_CONTINUED;
+			else
+				count = SWAP_MAP_MAX;
+		} else
+			count--;
+	}
+
+	usage = count | has_cache;
+	if (usage)
+		WRITE_ONCE(p->swap_map[offset], usage);
+	else
+		WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
+
+	return usage;
+}
+
+/*
+ * Check whether swap entry is valid in the swap device. If so,
+ * return pointer to swap_info_struct, and keep the swap entry valid
+ * via preventing the swap device from being swapoff, until
+ * put_swap_device() is called. Otherwise return NULL.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc. The
+ * caller must be prepared for that. For example, the following
+ * situation is possible.
+ *
+ *   CPU1				CPU2
+ *   do_swap_page()
+ *     ...				swapoff+swapon
+ *     __read_swap_cache_async()
+ *       swapcache_prepare()
+ *         __swap_duplicate()
+ *           // check swap_map
+ *     // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map need to be checked before
+ * changing partly because the specified swap entry may be for another
+ * swap device which has been swapoff. And in do_swap_page(), after
+ * the page is read from the swap device, the PTE is verified not
+ * changed with the page table locked to check whether the swap device
+ * has been swapoff or swapoff+swapon.
+ */
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+	struct swap_info_struct *si;
+	unsigned long offset;
+
+	if (!entry.val)
+		goto out;
+	si = swp_swap_info(entry);
+	if (!si)
+		goto bad_nofile;
+	if (!percpu_ref_tryget_live(&si->users))
+		goto out;
+	/*
+	 * Guarantee the si->users are checked before accessing other
+	 * fields of swap_info_struct.
+	 *
+	 * Paired with the spin_unlock() after setup_swap_info() in
+	 * enable_swap_info().
+	 */
+	smp_rmb();
+	offset = swp_offset(entry);
+	if (offset >= si->max)
+		goto put_out;
+
+	return si;
+bad_nofile:
+	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+	return NULL;
+put_out:
+	/* offset out of range: drop the reference taken above */
+	pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
+	percpu_ref_put(&si->users);
+	return NULL;
+}
+
+/*
+ * Drop one map reference (usage 1) from @entry under the cluster or device
+ * lock. When nothing is left, pass the entry to free_swap_slot(). Returns
+ * the remaining swap_map value.
+ */
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+				       swp_entry_t entry)
+{
+	struct swap_cluster_info *ci;
+	unsigned long offset = swp_offset(entry);
+	unsigned char usage;
+
+	ci = lock_cluster_or_swap_info(p, offset);
+	usage = __swap_entry_free_locked(p, offset, 1);
+	unlock_cluster_or_swap_info(p, ci);
+	if (!usage)
+		free_swap_slot(entry);
+
+	return usage;
+}
+
+/*
+ * Really free the slot of @entry, which must currently hold exactly
+ * SWAP_HAS_CACHE: zero its swap_map byte, drop the cluster usage count,
+ * uncharge the memcg and account the freed range.
+ */
+static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
+{
+	struct swap_cluster_info *ci;
+	unsigned long offset = swp_offset(entry);
+	unsigned char count;
+
+	ci = lock_cluster(p, offset);
+	count = p->swap_map[offset];
+	VM_BUG_ON(count != SWAP_HAS_CACHE);
+	p->swap_map[offset] = 0;
+	dec_cluster_info_page(p, p->cluster_info, offset);
+	unlock_cluster(ci);
+
+	mem_cgroup_uncharge_swap(entry, 1);
+	swap_range_free(p, offset, 1);
+}
+
+/*
+ * Caller has made sure that the swap device corresponding to entry
+ * is still around or has not been recycled.
+ */
+void swap_free(swp_entry_t entry)
+{
+	struct swap_info_struct *p;
+
+	p = _swap_info_get(entry);
+	if (p)
+		__swap_entry_free(p, entry);
+}
+
+/*
+ * Called after dropping swapcache to decrease refcnt to swap entries.
+ * Drops the SWAP_HAS_CACHE reference of every slot backing @folio. When
+ * the folio covers a whole cluster and no slot has any other reference,
+ * the cluster is freed in one go under si->lock.
+ */
+void put_swap_folio(struct folio *folio, swp_entry_t entry)
+{
+	unsigned long offset = swp_offset(entry);
+	unsigned long idx = offset / SWAPFILE_CLUSTER;
+	struct swap_cluster_info *ci;
+	struct swap_info_struct *si;
+	unsigned char *map;
+	unsigned int i, free_entries = 0;
+	unsigned char val;
+	int size = swap_entry_size(folio_nr_pages(folio));
+
+	si = _swap_info_get(entry);
+	if (!si)
+		return;
+
+	ci = lock_cluster_or_swap_info(si, offset);
+	if (size == SWAPFILE_CLUSTER) {
+		VM_BUG_ON(!cluster_is_huge(ci));
+		map = si->swap_map + offset;
+		/* count the slots whose only reference is the swap cache */
+		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+			val = map[i];
+			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+			if (val == SWAP_HAS_CACHE)
+				free_entries++;
+		}
+		cluster_clear_huge(ci);
+		if (free_entries == SWAPFILE_CLUSTER) {
+			unlock_cluster_or_swap_info(si, ci);
+			spin_lock(&si->lock);
+			mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+			swap_free_cluster(si, idx);
+			spin_unlock(&si->lock);
+			return;
+		}
+	}
+	for (i = 0; i < size; i++, entry.val++) {
+		if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
+			/* free_swap_slot() is called with the lock dropped */
+			unlock_cluster_or_swap_info(si, ci);
+			free_swap_slot(entry);
+			if (i == size - 1)
+				return;
+			/*
+			 * The return value is deliberately not stored: it is
+			 * taken with the same offset as the initial lock, so
+			 * ci still refers to the right lock.
+			 */
+			lock_cluster_or_swap_info(si, offset);
+		}
+	}
+	unlock_cluster_or_swap_info(si, ci);
+}
+
+#ifdef CONFIG_THP_SWAP
+/*
+ * Clear the huge flag of the cluster containing @entry. Returns 0 on
+ * success or -EBUSY when @entry does not pass _swap_info_get().
+ */
+int split_swap_cluster(swp_entry_t entry)
+{
+	struct swap_info_struct *si;
+	struct swap_cluster_info *ci;
+	unsigned long offset = swp_offset(entry);
+
+	si = _swap_info_get(entry);
+	if (!si)
+		return -EBUSY;
+	ci = lock_cluster(si, offset);
+	cluster_clear_huge(ci);
+	unlock_cluster(ci);
+	return 0;
+}
+#endif
+
+/* sort() comparator: orders swap entries by swap type (device) only. */
+static int swp_entry_cmp(const void *ent1, const void *ent2)
+{
+	const swp_entry_t *e1 = ent1, *e2 = ent2;
+
+	return (int)swp_type(*e1) - (int)swp_type(*e2);
+}
+
+/*
+ * Free the @n swap entries in @entries, each of which must hold only
+ * SWAP_HAS_CACHE (see swap_entry_free()). The array is sorted in place by
+ * device when more than one swap file exists.
+ */
+void swapcache_free_entries(swp_entry_t *entries, int n)
+{
+	struct swap_info_struct *p, *prev;
+	int i;
+
+	if (n <= 0)
+		return;
+
+	prev = NULL;
+	p = NULL;
+
+	/*
+	 * Sort swap entries by swap device, so each lock is only taken once.
+	 * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
+	 * so low that it isn't necessary to optimize further.
+	 */
+	if (nr_swapfiles > 1)
+		sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
+	for (i = 0; i < n; ++i) {
+		/* swaps p->lock for prev->lock only when the device changes */
+		p = swap_info_get_cont(entries[i], prev);
+		if (p)
+			swap_entry_free(p, entries[i]);
+		prev = p;
+	}
+	if (p)
+		spin_unlock(&p->lock);
+}
+
+/*
+ * Lockless read of the swap count of @entry; the device is pinned with
+ * get_swap_device() for the duration. Returns 0 when the device cannot be
+ * pinned.
+ */
+int __swap_count(swp_entry_t entry)
+{
+	struct swap_info_struct *si;
+	pgoff_t offset = swp_offset(entry);
+	int count = 0;
+
+	si = get_swap_device(entry);
+	if (si) {
+		count = swap_count(si->swap_map[offset]);
+		put_swap_device(si);
+	}
+	return count;
+}
+
+/*
+ * How many references to @entry are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
+ * The count is read under the cluster or device lock.
+ */
+static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+{
+	pgoff_t offset = swp_offset(entry);
+	struct swap_cluster_info *ci;
+	int count;
+
+	ci = lock_cluster_or_swap_info(si, offset);
+	count = swap_count(si->swap_map[offset]);
+	unlock_cluster_or_swap_info(si, ci);
+	return count;
+}
+
+/*
+ * How many references to @entry are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
+ * Unlike swap_swapcount(), this pins the device itself and returns 0 when
+ * that fails.
+ */
+int __swp_swapcount(swp_entry_t entry)
+{
+	int count = 0;
+	struct swap_info_struct *si;
+
+	si = get_swap_device(entry);
+	if (si) {
+		count = swap_swapcount(si, entry);
+		put_swap_device(si);
+	}
+	return count;
+}
+
+/*
+ * How many references to @entry are currently swapped out?
+ * This considers COUNT_CONTINUED so it returns exact answer.
+ */ +int swp_swapcount(swp_entry_t entry) +{ + int count, tmp_count, n; + struct swap_info_struct *p; + struct swap_cluster_info *ci; + struct page *page; + pgoff_t offset; + unsigned char *map; + + p = _swap_info_get(entry); + if (!p) + return 0; + + offset = swp_offset(entry); + + ci = lock_cluster_or_swap_info(p, offset); + + count = swap_count(p->swap_map[offset]); + if (!(count & COUNT_CONTINUED)) + goto out; + + count &= ~COUNT_CONTINUED; + n = SWAP_MAP_MAX + 1; + + page = vmalloc_to_page(p->swap_map + offset); + offset &= ~PAGE_MASK; + VM_BUG_ON(page_private(page) != SWP_CONTINUED); + + do { + page = list_next_entry(page, lru); + map = kmap_atomic(page); + tmp_count = map[offset]; + kunmap_atomic(map); + + count += (tmp_count & ~COUNT_CONTINUED) * n; + n *= (SWAP_CONT_MAX + 1); + } while (tmp_count & COUNT_CONTINUED); +out: + unlock_cluster_or_swap_info(p, ci); + return count; +} + +static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, + swp_entry_t entry) +{ + struct swap_cluster_info *ci; + unsigned char *map = si->swap_map; + unsigned long roffset = swp_offset(entry); + unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); + int i; + bool ret = false; + + ci = lock_cluster_or_swap_info(si, offset); + if (!ci || !cluster_is_huge(ci)) { + if (swap_count(map[roffset])) + ret = true; + goto unlock_out; + } + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + if (swap_count(map[offset + i])) { + ret = true; + break; + } + } +unlock_out: + unlock_cluster_or_swap_info(si, ci); + return ret; +} + +static bool folio_swapped(struct folio *folio) +{ + swp_entry_t entry = folio_swap_entry(folio); + struct swap_info_struct *si = _swap_info_get(entry); + + if (!si) + return false; + + if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) + return swap_swapcount(si, entry) != 0; + + return swap_page_trans_huge_swapped(si, entry); +} + +/** + * folio_free_swap() - Free the swap space used for this folio. + * @folio: The folio to remove. 
+ *
+ * If swap is getting full, or if there are no more mappings of this folio,
+ * then call folio_free_swap to free its swap space.
+ *
+ * Return: true if we were able to release the swap space.
+ */
+bool folio_free_swap(struct folio *folio)
+{
+	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+
+	/*
+	 * Nothing to free unless the folio is in the swap cache, is not
+	 * under writeback and its swap entry has no remaining references.
+	 */
+	if (!folio_test_swapcache(folio) || folio_test_writeback(folio) ||
+	    folio_swapped(folio))
+		return false;
+
+	/*
+	 * Hibernation hazard: once the memory image is being created, a
+	 * call to folio_free_swap() (most likely from
+	 * __try_to_reclaim_swap() while hibernation allocates swap for the
+	 * image, but possibly from ordinary reclaim too) could free the
+	 * swap of a folio already recorded in the image as a clean
+	 * swapcache folio, and that swap could then be reused for another
+	 * page of the image. After resume the original folio might be
+	 * dropped under memory pressure and later read back from swap with
+	 * the wrong contents.
+	 *
+	 * Hibernation suspends storage while it writes the image, so
+	 * refuse to free swap whenever storage is suspended.
+	 */
+	if (pm_suspended_storage())
+		return false;
+
+	delete_from_swap_cache(folio);
+	folio_set_dirty(folio);
+
+	return true;
+}
+
+/*
+ * Free the swap entry like above, but also try to
+ * free the page cache entry if it is the last user.
+ */ +int free_swap_and_cache(swp_entry_t entry) +{ + struct swap_info_struct *p; + unsigned char count; + + if (non_swap_entry(entry)) + return 1; + + p = _swap_info_get(entry); + if (p) { + count = __swap_entry_free(p, entry); + if (count == SWAP_HAS_CACHE && + !swap_page_trans_huge_swapped(p, entry)) + __try_to_reclaim_swap(p, swp_offset(entry), + TTRS_UNMAPPED | TTRS_FULL); + } + return p != NULL; +} + +#ifdef CONFIG_HIBERNATION + +swp_entry_t get_swap_page_of_type(int type) +{ + struct swap_info_struct *si = swap_type_to_swap_info(type); + swp_entry_t entry = {0}; + + if (!si) + goto fail; + + /* This is called for allocating swap entry, not cache */ + spin_lock(&si->lock); + if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry)) + atomic_long_dec(&nr_swap_pages); + spin_unlock(&si->lock); +fail: + return entry; +} + +/* + * Find the swap type that corresponds to given device (if any). + * + * @offset - number of the PAGE_SIZE-sized block of the device, starting + * from 0, in which the swap header is expected to be located. + * + * This is needed for the suspend to disk (aka swsusp). 
+ */ +int swap_type_of(dev_t device, sector_t offset) +{ + int type; + + if (!device) + return -1; + + spin_lock(&swap_lock); + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *sis = swap_info[type]; + + if (!(sis->flags & SWP_WRITEOK)) + continue; + + if (device == sis->bdev->bd_dev) { + struct swap_extent *se = first_se(sis); + + if (se->start_block == offset) { + spin_unlock(&swap_lock); + return type; + } + } + } + spin_unlock(&swap_lock); + return -ENODEV; +} + +int find_first_swap(dev_t *device) +{ + int type; + + spin_lock(&swap_lock); + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *sis = swap_info[type]; + + if (!(sis->flags & SWP_WRITEOK)) + continue; + *device = sis->bdev->bd_dev; + spin_unlock(&swap_lock); + return type; + } + spin_unlock(&swap_lock); + return -ENODEV; +} + +/* + * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev + * corresponding to given index in swap_info (swap type). + */ +sector_t swapdev_block(int type, pgoff_t offset) +{ + struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_extent *se; + + if (!si || !(si->flags & SWP_WRITEOK)) + return 0; + se = offset_to_swap_extent(si, offset); + return se->start_block + (offset - se->start_page); +} + +/* + * Return either the total number of swap pages of given type, or the number + * of free pages of that type (depending on @free) + * + * This is needed for software suspend + */ +unsigned int count_swap_pages(int type, int free) +{ + unsigned int n = 0; + + spin_lock(&swap_lock); + if ((unsigned int)type < nr_swapfiles) { + struct swap_info_struct *sis = swap_info[type]; + + spin_lock(&sis->lock); + if (sis->flags & SWP_WRITEOK) { + n = sis->pages; + if (free) + n -= sis->inuse_pages; + } + spin_unlock(&sis->lock); + } + spin_unlock(&swap_lock); + return n; +} +#endif /* CONFIG_HIBERNATION */ + +static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) +{ + return 
pte_same(pte_swp_clear_flags(pte), swp_pte);
+}
+
+/*
+ * No need to decide whether this PTE shares the swap entry with others,
+ * just let do_wp_page work it out if a write is requested later - to
+ * force COW, vm_page_prot omits write permission from any private vma.
+ *
+ * Replace the swap PTE at @addr with a present PTE for the page of @folio
+ * that corresponds to @entry.
+ *
+ * Returns 1 when the PTE was replaced, 0 when the PTE no longer matches
+ * @entry or the page is not uptodate (in the latter case a swapin-error
+ * entry is installed and the swap entry is freed), and -ENOMEM when
+ * ksm_might_need_to_copy() returns NULL.
+ */
+static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long addr, swp_entry_t entry, struct folio *folio)
+{
+	struct page *page = folio_file_page(folio, swp_offset(entry));
+	struct page *swapcache;
+	spinlock_t *ptl;
+	pte_t *pte, new_pte;
+	int ret = 1;
+
+	swapcache = page;	/* remembered so that a page substituted by ksm can be told apart later */
+	page = ksm_might_need_to_copy(page, vma, addr);
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
+		ret = 0;	/* the PTE changed since the caller read it: nothing to do */
+		goto out;
+	}
+
+	if (unlikely(!PageUptodate(page))) {
+		pte_t pteval;
+
+		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+		pteval = swp_entry_to_pte(make_swapin_error_entry(page));
+		set_pte_at(vma->vm_mm, addr, pte, pteval);
+		swap_free(entry);
+		ret = 0;
+		goto out;
+	}
+
+	/* See do_swap_page() */
+	BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
+	BUG_ON(PageAnon(page) && PageAnonExclusive(page));
+
+	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+	get_page(page);
+	if (page == swapcache) {
+		rmap_t rmap_flags = RMAP_NONE;
+
+		/*
+		 * See do_swap_page(): PageWriteback() would be problematic.
+		 * However, we do a wait_on_page_writeback() just before this
+		 * call and have the page locked.
+		 */
+		VM_BUG_ON_PAGE(PageWriteback(page), page);
+		if (pte_swp_exclusive(*pte))
+			rmap_flags |= RMAP_EXCLUSIVE;
+
+		page_add_anon_rmap(page, vma, addr, rmap_flags);
+	} else { /* ksm created a completely new copy */
+		page_add_new_anon_rmap(page, vma, addr);
+		lru_cache_add_inactive_or_unevictable(page, vma);
+	}
+	new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
+	if (pte_swp_soft_dirty(*pte))	/* carry the soft-dirty bit of the swap PTE over */
+		new_pte = pte_mksoft_dirty(new_pte);
+	if (pte_swp_uffd_wp(*pte))	/* likewise for the uffd-wp bit */
+		new_pte = pte_mkuffd_wp(new_pte);
+	set_pte_at(vma->vm_mm, addr, pte, new_pte);
+	swap_free(entry);
+out:
+	pte_unmap_unlock(pte, ptl);
+	if (page != swapcache) {	/* only a page substituted by ksm is unlocked and released here */
+		unlock_page(page);
+		put_page(page);
+	}
+	return ret;
+}
+/*
+ * Walk the PTEs covering [@addr, @end) under @pmd and call unuse_pte() for
+ * every swap PTE whose entry belongs to swap area @type.
+ *
+ * The folio is taken from the swap cache or, failing that, from
+ * swapin_readahead().  If neither yields a folio, the slot is skipped when
+ * its swap_map byte is 0 or SWAP_MAP_BAD, and -ENOMEM is returned otherwise.
+ * A negative result from unuse_pte() is returned as is; in all other cases
+ * the function returns 0.
+ */
+static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+			unsigned long addr, unsigned long end,
+			unsigned int type)
+{
+	swp_entry_t entry;
+	pte_t *pte;
+	struct swap_info_struct *si;
+	int ret = 0;
+	volatile unsigned char *swap_map;
+
+	si = swap_info[type];
+	pte = pte_offset_map(pmd, addr);
+	do {
+		struct folio *folio;
+		unsigned long offset;
+
+		if (!is_swap_pte(*pte))
+			continue;
+
+		entry = pte_to_swp_entry(*pte);
+		if (swp_type(entry) != type)
+			continue;
+
+		offset = swp_offset(entry);
+		pte_unmap(pte);	/* the mapping is taken again at try_next */
+		swap_map = &si->swap_map[offset];
+		folio = swap_cache_get_folio(entry, vma, addr);
+		if (!folio) {
+			struct page *page;
+			struct vm_fault vmf = {
+				.vma = vma,
+				.address = addr,
+				.real_address = addr,
+				.pmd = pmd,
+			};
+
+			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+						&vmf);
+			if (page)
+				folio = page_folio(page);
+		}
+		if (!folio) {
+			if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
+				goto try_next;
+			return -ENOMEM;
+		}
+
+		folio_lock(folio);
+		folio_wait_writeback(folio);
+		ret = unuse_pte(vma, pmd, addr, entry, folio);
+		if (ret < 0) {
+			folio_unlock(folio);
+			folio_put(folio);
+			goto out;
+		}
+
+		folio_free_swap(folio);
+		folio_unlock(folio);
+		folio_put(folio);
+try_next:
+		pte = pte_offset_map(pmd, addr);
+	} while (pte++, addr +=
PAGE_SIZE, addr != end);
+	pte_unmap(pte - 1);	/* pte was advanced once past the last entry visited */
+
+	ret = 0;	/* a positive result from the last unuse_pte() call is not passed up */
+out:
+	return ret;
+}
+/*
+ * Walk the PMD entries covering [@addr, @end) under @pud.  Entries rejected
+ * by pmd_none_or_trans_huge_or_clear_bad() are skipped; every other range is
+ * handed to unuse_pte_range().  The walk stops at the first non-zero return
+ * value, which is passed up; otherwise 0 is returned.  cond_resched() is
+ * called once per PMD entry.
+ */
+static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+				unsigned long addr, unsigned long end,
+				unsigned int type)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	int ret;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		cond_resched();
+		next = pmd_addr_end(addr, end);
+		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+			continue;
+		ret = unuse_pte_range(vma, pmd, addr, next, type);
+		if (ret)
+			return ret;
+	} while (pmd++, addr = next, addr != end);
+	return 0;
+}
+/*
+ * Walk the PUD entries covering [@addr, @end) under @p4d, skipping entries
+ * rejected by pud_none_or_clear_bad() and handing the rest to
+ * unuse_pmd_range().  Stops at, and returns, the first non-zero result.
+ */
+static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
+				unsigned long addr, unsigned long end,
+				unsigned int type)
+{
+	pud_t *pud;
+	unsigned long next;
+	int ret;
+
+	pud = pud_offset(p4d, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		ret = unuse_pmd_range(vma, pud, addr, next, type);
+		if (ret)
+			return ret;
+	} while (pud++, addr = next, addr != end);
+	return 0;
+}
+/*
+ * Walk the P4D entries covering [@addr, @end) under @pgd, skipping entries
+ * rejected by p4d_none_or_clear_bad() and handing the rest to
+ * unuse_pud_range().  Stops at, and returns, the first non-zero result.
+ */
+static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				unsigned int type)
+{
+	p4d_t *p4d;
+	unsigned long next;
+	int ret;
+
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+		if (p4d_none_or_clear_bad(p4d))
+			continue;
+		ret = unuse_pud_range(vma, p4d, addr, next, type);
+		if (ret)
+			return ret;
+	} while (p4d++, addr = next, addr != end);
+	return 0;
+}
+/*
+ * Walk the page tables of @vma from vm_start to vm_end, one PGD entry at a
+ * time, skipping entries rejected by pgd_none_or_clear_bad() and handing the
+ * rest to unuse_p4d_range().  Stops at, and returns, the first non-zero
+ * result; returns 0 when the whole range was walked.
+ */
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
+{
+	pgd_t *pgd;
+	unsigned long addr, end, next;
+	int ret;
+
+	addr = vma->vm_start;
+	end = vma->vm_end;
+
+	pgd = pgd_offset(vma->vm_mm, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		ret = unuse_p4d_range(vma, pgd, addr, next, type);
+		if (ret)
+			return ret;
+	} while (pgd++, addr = next, addr != end);
+	return 0;
+}
+/*
+ * Run unuse_vma() on every VMA of @mm that has an anon_vma, with the mmap
+ * lock held for reading.  The walk stops at the first non-zero result,
+ * which is returned; otherwise 0 is returned.
+ */
+static int unuse_mm(struct mm_struct *mm, unsigned int type)
+{
+	struct vm_area_struct *vma;
+
int ret = 0;
+	VMA_ITERATOR(vmi, mm, 0);
+
+	mmap_read_lock(mm);
+	for_each_vma(vmi, vma) {
+		if (vma->anon_vma) {	/* VMAs without an anon_vma are not scanned */
+			ret = unuse_vma(vma, type);
+			if (ret)
+				break;
+		}
+
+		cond_resched();
+	}
+	mmap_read_unlock(mm);
+	return ret;
+}
+
+/*
+ * Scan swap_map from current position to next entry still in use.
+ * Return 0 if there are no inuse entries after prev till end of
+ * the map.
+ *
+ * The scan starts at @prev + 1.  An entry counts as in use when its
+ * swap_map byte is non-zero and swap_count() of it is not SWAP_MAP_BAD.
+ */
+static unsigned int find_next_to_unuse(struct swap_info_struct *si,
+					unsigned int prev)
+{
+	unsigned int i;
+	unsigned char count;
+
+	/*
+	 * No need for swap_lock here: we're just looking
+	 * for whether an entry is in use, not modifying it; false
+	 * hits are okay, and sys_swapoff() has already prevented new
+	 * allocations from this area (while holding swap_lock).
+	 */
+	for (i = prev + 1; i < si->max; i++) {
+		count = READ_ONCE(si->swap_map[i]);
+		if (count && swap_count(count) != SWAP_MAP_BAD)
+			break;
+		if ((i % LATENCY_LIMIT) == 0)
+			cond_resched();
+	}
+
+	if (i == si->max)
+		i = 0;	/* 0 doubles as the nothing-found value */
+
+	return i;
+}
+/*
+ * Try to bring every page of swap area @type back in so that the area can
+ * be released.
+ *
+ * Returns 0 at once when si->inuse_pages is already 0.  Otherwise each pass
+ * calls shmem_unuse(), then unuse_mm() on every mm of the mmlist that can be
+ * pinned with mmget_not_zero(), then walks the remaining in-use swap_map
+ * slots and calls folio_free_swap() on their swap cache folios.  Passes are
+ * repeated until si->inuse_pages reads 0.
+ *
+ * Returns 0 on success, the first non-zero result of shmem_unuse() or
+ * unuse_mm(), or -EINTR when a signal is pending while pages are still in
+ * use.
+ */
+static int try_to_unuse(unsigned int type)
+{
+	struct mm_struct *prev_mm;
+	struct mm_struct *mm;
+	struct list_head *p;
+	int retval = 0;
+	struct swap_info_struct *si = swap_info[type];
+	struct folio *folio;
+	swp_entry_t entry;
+	unsigned int i;
+
+	if (!READ_ONCE(si->inuse_pages))
+		return 0;
+
+retry:
+	retval = shmem_unuse(type);
+	if (retval)
+		return retval;
+
+	prev_mm = &init_mm;
+	mmget(prev_mm);
+
+	spin_lock(&mmlist_lock);
+	p = &init_mm.mmlist;
+	while (READ_ONCE(si->inuse_pages) &&
+	       !signal_pending(current) &&
+	       (p = p->next) != &init_mm.mmlist) {
+
+		mm = list_entry(p, struct mm_struct, mmlist);
+		if (!mmget_not_zero(mm))
+			continue;
+		spin_unlock(&mmlist_lock);
+		mmput(prev_mm);	/* the previous mm stayed pinned until the next one was pinned */
+		prev_mm = mm;
+		retval = unuse_mm(mm, type);
+		if (retval) {
+			mmput(prev_mm);
+			return retval;
+		}
+
+		/*
+		 * Make sure that we aren't completely killing
+		 * interactive performance.
+		 */
+		cond_resched();
+		spin_lock(&mmlist_lock);
+	}
+	spin_unlock(&mmlist_lock);
+
+	mmput(prev_mm);
+
+	i = 0;
+	while (READ_ONCE(si->inuse_pages) &&
+	       !signal_pending(current) &&
+	       (i = find_next_to_unuse(si, i)) != 0) {
+
+		entry = swp_entry(type, i);
+		folio = filemap_get_folio(swap_address_space(entry), i);
+		if (!folio)
+			continue;
+
+		/*
+		 * It is conceivable that a racing task removed this folio from
+		 * swap cache just before we acquired the page lock. The folio
+		 * might even be back in swap cache on another swap area. But
+		 * that is okay, folio_free_swap() only removes stale folios.
+		 */
+		folio_lock(folio);
+		folio_wait_writeback(folio);
+		folio_free_swap(folio);
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+	/*
+	 * Check again whether there are still swap entries in the map.
+	 * If so, the unuse logic above has to be retried.
+	 * Under global memory pressure, swap entries can be reinserted back
+	 * into process space after the mmlist loop above passes over them.
+	 *
+	 * Limit the number of retries? No: when mmget_not_zero()
+	 * above fails, that mm is likely to be freeing swap from
+	 * exit_mmap(), which proceeds at its own independent pace;
+	 * and even shmem_writepage() could have been preempted after
+	 * folio_alloc_swap(), temporarily hiding that swap. It's easy
+	 * and robust (though cpu-intensive) just to keep retrying.
+	 */
+	if (READ_ONCE(si->inuse_pages)) {
+		if (!signal_pending(current))
+			goto retry;
+		return -EINTR;	/* pages are still in use and a signal is pending */
+	}
+
+	return 0;
+}
+
+/*
+ * After a successful try_to_unuse, if no swap is now in use, we know
+ * we can empty the mmlist. swap_lock must be held on entry and exit.
+ * Note that mmlist_lock nests inside swap_lock, and an mm must be
+ * added to the mmlist just after page_duplicate - before would be racy.
+ */
+static void drain_mmlist(void)
+{
+	struct list_head *p, *next;
+	unsigned int type;
+
+	for (type = 0; type < nr_swapfiles; type++)
+		if (swap_info[type]->inuse_pages)
+			return;	/* some swap area still has pages in use: leave the mmlist alone */
+	spin_lock(&mmlist_lock);
+	list_for_each_safe(p, next, &init_mm.mmlist)
+		list_del_init(p);
+	spin_unlock(&mmlist_lock);
+}
+
+/*
+ * Free all of a swapdev's extent information
+ *
+ * Every node of sis->swap_extent_root is erased and freed.  If the area has
+ * SWP_ACTIVATED set, the flag is cleared and the swap_deactivate()
+ * address_space operation of the swap file is called when one is provided.
+ */
+static void destroy_swap_extents(struct swap_info_struct *sis)
+{
+	while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
+		struct rb_node *rb = sis->swap_extent_root.rb_node;
+		struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
+
+		rb_erase(rb, &sis->swap_extent_root);
+		kfree(se);
+	}
+
+	if (sis->flags & SWP_ACTIVATED) {
+		struct file *swap_file = sis->swap_file;
+		struct address_space *mapping = swap_file->f_mapping;
+
+		sis->flags &= ~SWP_ACTIVATED;
+		if (mapping->a_ops->swap_deactivate)
+			mapping->a_ops->swap_deactivate(swap_file);
+	}
+}
+
+/*
+ * Add a block range (and the corresponding page range) into this swapdev's
+ * extent tree.
+ *
+ * This function rather assumes that it is called in ascending page order.
+ *
+ * Returns 0 when the range was merged into the rightmost existing extent,
+ * 1 when a new extent was inserted, and -ENOMEM when the allocation of a
+ * new extent failed.  A range whose @start_page does not directly follow
+ * the rightmost extent trips the BUG_ON() below.
+ */
+int
+add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
+		unsigned long nr_pages, sector_t start_block)
+{
+	struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
+	struct swap_extent *se;
+	struct swap_extent *new_se;
+
+	/*
+	 * Place the new node at the rightmost position, since the
+	 * function is called in ascending page order.
+	 */
+	while (*link) {
+		parent = *link;
+		link = &parent->rb_right;
+	}
+
+	if (parent) {
+		se = rb_entry(parent, struct swap_extent, rb_node);
+		BUG_ON(se->start_page + se->nr_pages != start_page);
+		if (se->start_block + se->nr_pages == start_block) {
+			/* Merge it */
+			se->nr_pages += nr_pages;
+			return 0;
+		}
+	}
+
+	/* No merge, insert a new extent.
*/
+	new_se = kmalloc(sizeof(*se), GFP_KERNEL);	/* se and new_se are both struct swap_extent pointers, so this is the extent size */
+	if (new_se == NULL)
+		return -ENOMEM;
+	new_se->start_page = start_page;
+	new_se->nr_pages = nr_pages;
+	new_se->start_block = start_block;
+
+	rb_link_node(&new_se->rb_node, parent, link);	/* link points at the rb_right slot found by the walk above */
+	rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
+	return 1;	/* one new extent was added (the merge path returns 0) */
+}
+EXPORT_SYMBOL_GPL(add_swap_extent);
+
+/*
+ * A `swap extent' is a simple thing which maps a contiguous range of pages
+ * onto a contiguous range of disk blocks. A rbtree of swap extents is
+ * built at swapon time and is then used at swap_writepage/swap_readpage
+ * time for locating where on disk a page belongs.
+ *
+ * If the swapfile is an S_ISBLK block device, a single extent is installed.
+ * This is done so that the main operating code can treat S_ISBLK and S_ISREG
+ * swap files identically.
+ *
+ * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
+ * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
+ * swapfiles are handled *identically* after swapon time.
+ *
+ * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
+ * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray
+ * blocks are found which do not fall within the PAGE_SIZE alignment
+ * requirements, they are simply tossed out - we will never use those blocks
+ * for swapping.
+ *
+ * For all swap devices we set S_SWAPFILE across the life of the swapon. This
+ * prevents users from writing to the swap device, which will corrupt memory.
+ *
+ * The amount of disk space which a single swap extent represents varies.
+ * Typically it is in the 1-4 megabyte range. So we can have hundreds of
+ * extents in the rbtree. - akpm.
+ */
+static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
+{
+	struct file *swap_file = sis->swap_file;
+	struct address_space *mapping = swap_file->f_mapping;
+	struct inode *inode = mapping->host;
+	int ret;
+
+	if (S_ISBLK(inode->i_mode)) {
+		ret = add_swap_extent(sis, 0, sis->max, 0);	/* one extent covering pages 0..max, starting at block 0 */
+		*span = sis->pages;
+		return ret;
+	}
+
+	if (mapping->a_ops->swap_activate) {
+		ret = mapping->a_ops->swap_activate(sis, swap_file, span);
+		if (ret < 0)
+			return ret;
+		sis->flags |= SWP_ACTIVATED;
+		if ((sis->flags & SWP_FS_OPS) &&
+		    sio_pool_init() != 0) {
+			destroy_swap_extents(sis);	/* also clears SWP_ACTIVATED and calls swap_deactivate() if provided */
+			return -ENOMEM;
+		}
+		return ret;
+	}
+
+	return generic_swapfile_activate(sis, swap_file, span);	/* fallback when the mapping has no swap_activate() */
+}
+/*
+ * Return the node_id of the disk behind swap area @p.  The block device is
+ * p->bdev when set, otherwise the s_bdev of the superblock that holds the
+ * swap file; NUMA_NO_NODE is returned when neither is available.
+ */
+static int swap_node(struct swap_info_struct *p)
+{
+	struct block_device *bdev;
+
+	if (p->bdev)
+		bdev = p->bdev;
+	else
+		bdev = p->swap_file->f_inode->i_sb->s_bdev;
+
+	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
+}
+/*
+ * Record the priority, swap_map and cluster_info of swap area @p.
+ *
+ * A non-negative @prio is used as given; a negative @prio makes the area
+ * take the next auto-assigned priority (--least_priority).  The per-node
+ * avail list priority is the negated area priority, except that an area
+ * with a negative priority gets the value 1 on the node reported by
+ * swap_node().
+ */
+static void setup_swap_info(struct swap_info_struct *p, int prio,
+			    unsigned char *swap_map,
+			    struct swap_cluster_info *cluster_info)
+{
+	int i;
+
+	if (prio >= 0)
+		p->prio = prio;
+	else
+		p->prio = --least_priority;
+	/*
+	 * the plist prio is negated because plist ordering is
+	 * low-to-high, while swap ordering is high-to-low
+	 */
+	p->list.prio = -p->prio;
+	for_each_node(i) {
+		if (p->prio >= 0)
+			p->avail_lists[i].prio = -p->prio;
+		else {
+			if (swap_node(p) == i)
+				p->avail_lists[i].prio = 1;
+			else
+				p->avail_lists[i].prio = -p->prio;
+		}
+	}
+	p->swap_map = swap_map;
+	p->cluster_info = cluster_info;
+}
+/*
+ * Make swap area @p available: set SWP_WRITEOK, add p->pages to
+ * nr_swap_pages and total_swap_pages, and insert the area into
+ * swap_active_head and the avail lists.  swap_lock must be held, which is
+ * asserted below.
+ */
+static void _enable_swap_info(struct swap_info_struct *p)
+{
+	p->flags |= SWP_WRITEOK;
+	atomic_long_add(p->pages, &nr_swap_pages);
+	total_swap_pages += p->pages;
+
+	assert_spin_locked(&swap_lock);
+	/*
+	 * both lists are plists, and thus priority ordered.
+	 * swap_active_head needs to be priority ordered for swapoff(),
+	 * which on removal of any swap_info_struct with an auto-assigned
+	 * (i.e.
negative) priority increments the auto-assigned priority
+	 * of any lower-priority swap_info_structs.
+	 * swap_avail_head needs to be priority ordered for folio_alloc_swap(),
+	 * which allocates swap pages from the highest available priority
+	 * swap_info_struct.
+	 */
+	plist_add(&p->list, &swap_active_head);
+	add_to_avail_list(p);
+}
+/*
+ * Bring a freshly set up swap area into service.
+ *
+ * The work is split into two locked sections on purpose: the first one
+ * fills in priority, swap_map and cluster_info, then the users reference
+ * is resurrected, and only after that is the area marked SWP_WRITEOK and
+ * inserted into the lists.  frontswap_init() is called first when
+ * CONFIG_FRONTSWAP is enabled.
+ */
+static void enable_swap_info(struct swap_info_struct *p, int prio,
+				unsigned char *swap_map,
+				struct swap_cluster_info *cluster_info,
+				unsigned long *frontswap_map)
+{
+	if (IS_ENABLED(CONFIG_FRONTSWAP))
+		frontswap_init(p->type, frontswap_map);
+	spin_lock(&swap_lock);
+	spin_lock(&p->lock);
+	setup_swap_info(p, prio, swap_map, cluster_info);
+	spin_unlock(&p->lock);
+	spin_unlock(&swap_lock);
+	/*
+	 * Finished initializing swap device, now it's safe to reference it.
+	 */
+	percpu_ref_resurrect(&p->users);
+	spin_lock(&swap_lock);
+	spin_lock(&p->lock);
+	_enable_swap_info(p);
+	spin_unlock(&p->lock);
+	spin_unlock(&swap_lock);
+}
+/*
+ * Put swap area @p back into service with the priority, swap_map and
+ * cluster_info it already has.  Unlike enable_swap_info() this is done in a
+ * single locked section and without touching the users reference.
+ */
+static void reinsert_swap_info(struct swap_info_struct *p)
+{
+	spin_lock(&swap_lock);
+	spin_lock(&p->lock);
+	setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+	_enable_swap_info(p);
+	spin_unlock(&p->lock);
+	spin_unlock(&swap_lock);
+}
+/*
+ * Return true when swap_active_head is not empty, i.e. at least one swap
+ * area is currently on the active list.  The list is checked under
+ * swap_lock.
+ */
+bool has_usable_swap(void)
+{
+	bool ret = true;
+
+	spin_lock(&swap_lock);
+	if (plist_head_empty(&swap_active_head))
+		ret = false;
+	spin_unlock(&swap_lock);
+	return ret;
+}
+/*
+ * swapoff system call: take the swap area backed by @specialfile out of
+ * service.
+ *
+ * Returns -EPERM without CAP_SYS_ADMIN, the getname() or file_open_name()
+ * error for a bad path, -EINVAL when no SWP_WRITEOK area uses the mapping
+ * of the opened file, -ENOMEM when security_vm_enough_memory_mm() refuses
+ * the request, the error from try_to_unuse() when the pages cannot be
+ * brought back in, and 0 on success.
+ */
+SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
+{
+	struct swap_info_struct *p = NULL;
+	unsigned char *swap_map;
+	struct swap_cluster_info *cluster_info;
+	unsigned long *frontswap_map;
+	struct file *swap_file, *victim;
+	struct address_space *mapping;
+	struct inode *inode;
+	struct filename *pathname;
+	int err, found = 0;
+	unsigned int old_block_size;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	BUG_ON(!current->mm);
+
+	pathname = getname(specialfile);
+	if (IS_ERR(pathname))
+		return PTR_ERR(pathname);
+
+	victim = file_open_name(pathname,
O_RDWR|O_LARGEFILE, 0);
+	err = PTR_ERR(victim);	/* used only by the IS_ERR() branch below */
+	if (IS_ERR(victim))
+		goto out;
+
+	mapping = victim->f_mapping;
+	spin_lock(&swap_lock);
+	plist_for_each_entry(p, &swap_active_head, list) {
+		if (p->flags & SWP_WRITEOK) {
+			if (p->swap_file->f_mapping == mapping) {	/* areas are matched by address_space, not by path */
+				found = 1;
+				break;
+			}
+		}
+	}
+	if (!found) {
+		err = -EINVAL;
+		spin_unlock(&swap_lock);
+		goto out_dput;
+	}
+	if (!security_vm_enough_memory_mm(current->mm, p->pages))
+		vm_unacct_memory(p->pages);	/* the check above succeeded; undo its accounting straight away */
+	else {
+		err = -ENOMEM;
+		spin_unlock(&swap_lock);
+		goto out_dput;
+	}
+	spin_lock(&p->lock);
+	del_from_avail_list(p);
+	if (p->prio < 0) {	/* auto-assigned priority: move every area listed after p up by one */
+		struct swap_info_struct *si = p;
+		int nid;
+
+		plist_for_each_entry_continue(si, &swap_active_head, list) {
+			si->prio++;
+			si->list.prio--;
+			for_each_node(nid) {
+				if (si->avail_lists[nid].prio != 1)	/* 1 is the value setup_swap_info() gives the local node */
+					si->avail_lists[nid].prio--;
+			}
+		}
+		least_priority++;
+	}
+	plist_del(&p->list, &swap_active_head);
+	atomic_long_sub(p->pages, &nr_swap_pages);
+	total_swap_pages -= p->pages;
+	p->flags &= ~SWP_WRITEOK;
+	spin_unlock(&p->lock);
+	spin_unlock(&swap_lock);
+
+	disable_swap_slots_cache_lock();
+
+	set_current_oom_origin();
+	err = try_to_unuse(p->type);
+	clear_current_oom_origin();
+
+	if (err) {
+		/* re-insert swap space back into swap_list */
+		reinsert_swap_info(p);
+		reenable_swap_slots_cache_unlock();
+		goto out_dput;
+	}
+
+	reenable_swap_slots_cache_unlock();
+
+	/*
+	 * Wait for swap operations protected by get/put_swap_device()
+	 * to complete.
+	 *
+	 * We need synchronize_rcu() here to protect the accessing to
+	 * the swap cache data structure.
+	 */
+	percpu_ref_kill(&p->users);
+	synchronize_rcu();
+	wait_for_completion(&p->comp);
+
+	flush_work(&p->discard_work);
+
+	destroy_swap_extents(p);
+	if (p->flags & SWP_CONTINUED)
+		free_swap_count_continuations(p);
+
+	if (!p->bdev || !bdev_nonrot(p->bdev))	/* the inverse of the test swapon uses before it increments nr_rotate_swap */
+		atomic_dec(&nr_rotate_swap);
+
+	mutex_lock(&swapon_mutex);
+	spin_lock(&swap_lock);
+	spin_lock(&p->lock);
+	drain_mmlist();
+
+	/* wait for anyone still in scan_swap_map_slots */
+	p->highest_bit = 0;		/* cuts scans short */
+	while (p->flags >= SWP_SCANNING) {
+		spin_unlock(&p->lock);
+		spin_unlock(&swap_lock);
+		schedule_timeout_uninterruptible(1);	/* both locks are dropped while sleeping and retaken in the same order */
+		spin_lock(&swap_lock);
+		spin_lock(&p->lock);
+	}
+
+	swap_file = p->swap_file;	/* detach the resources under the locks; they are freed after unlocking */
+	old_block_size = p->old_block_size;
+	p->swap_file = NULL;
+	p->max = 0;
+	swap_map = p->swap_map;
+	p->swap_map = NULL;
+	cluster_info = p->cluster_info;
+	p->cluster_info = NULL;
+	frontswap_map = frontswap_map_get(p);
+	spin_unlock(&p->lock);
+	spin_unlock(&swap_lock);
+	arch_swap_invalidate_area(p->type);
+	frontswap_invalidate_area(p->type);
+	frontswap_map_set(p, NULL);
+	mutex_unlock(&swapon_mutex);
+	free_percpu(p->percpu_cluster);
+	p->percpu_cluster = NULL;
+	free_percpu(p->cluster_next_cpu);
+	p->cluster_next_cpu = NULL;
+	vfree(swap_map);
+	kvfree(cluster_info);
+	kvfree(frontswap_map);
+	/* Destroy swap account information */
+	swap_cgroup_swapoff(p->type);
+	exit_swap_address_space(p->type);
+
+	inode = mapping->host;
+	if (S_ISBLK(inode->i_mode)) {
+		struct block_device *bdev = I_BDEV(inode);
+
+		set_blocksize(bdev, old_block_size);	/* restore the size that was saved in p->old_block_size */
+		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+	}
+
+	inode_lock(inode);
+	inode->i_flags &= ~S_SWAPFILE;
+	inode_unlock(inode);
+	filp_close(swap_file, NULL);
+
+	/*
+	 * Clear the SWP_USED flag after all resources are freed so that swapon
+	 * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
+	 * not hold p->lock after we cleared its SWP_WRITEOK.
+	 */
+	spin_lock(&swap_lock);
+	p->flags = 0;
+	spin_unlock(&swap_lock);
+
+	err = 0;
+	atomic_inc(&proc_poll_event);	/* seen by swaps_poll() as a change of the event counter */
+	wake_up_interruptible(&proc_poll_wait);
+
+out_dput:
+	filp_close(victim, NULL);
+out:
+	putname(pathname);
+	return err;
+}
+
+#ifdef CONFIG_PROC_FS
+static __poll_t swaps_poll(struct file *file, poll_table *wait)
+{
+	struct seq_file *seq = file->private_data;
+
+	poll_wait(file, &proc_poll_wait, wait);
+
+	if (seq->poll_event != atomic_read(&proc_poll_event)) {
+		seq->poll_event = atomic_read(&proc_poll_event);	/* remember the counter so the change is reported once */
+		return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
+	}
+
+	return EPOLLIN | EPOLLRDNORM;	/* no change of proc_poll_event since the last call or open */
+}
+
+/*
+ * iterator
+ *
+ * swap_start() takes swapon_mutex, which is released again by swap_stop().
+ * Position 0 yields SEQ_START_TOKEN (the header line); position n > 0
+ * yields the n-th swap_info_struct that has SWP_USED set and a non-NULL
+ * swap_map, or NULL when there are fewer than n such areas.
+ */
+static void *swap_start(struct seq_file *swap, loff_t *pos)
+{
+	struct swap_info_struct *si;
+	int type;
+	loff_t l = *pos;
+
+	mutex_lock(&swapon_mutex);
+
+	if (!l)
+		return SEQ_START_TOKEN;
+
+	for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
+		if (!(si->flags & SWP_USED) || !si->swap_map)
+			continue;
+		if (!--l)
+			return si;
+	}
+
+	return NULL;
+}
+/*
+ * Advance the iterator: increment *@pos and return the next
+ * swap_info_struct after @v that has SWP_USED set and a non-NULL swap_map,
+ * or NULL when there is none.  After SEQ_START_TOKEN the search starts at
+ * type 0.
+ */
+static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
+{
+	struct swap_info_struct *si = v;
+	int type;
+
+	if (v == SEQ_START_TOKEN)
+		type = 0;
+	else
+		type = si->type + 1;
+
+	++(*pos);
+	for (; (si = swap_type_to_swap_info(type)); type++) {
+		if (!(si->flags & SWP_USED) || !si->swap_map)
+			continue;
+		return si;
+	}
+
+	return NULL;
+}
+/*
+ * End of an iteration: drop the swapon_mutex taken by swap_start().
+ */
+static void swap_stop(struct seq_file *swap, void *v)
+{
+	mutex_unlock(&swapon_mutex);
+}
+/*
+ * Print one line of the swaps listing: the header for SEQ_START_TOKEN,
+ * otherwise path, type, size, usage and priority of the swap area @v.
+ * Always returns 0.
+ */
+static int swap_show(struct seq_file *swap, void *v)
+{
+	struct swap_info_struct *si = v;
+	struct file *file;
+	int len;
+	unsigned long bytes, inuse;
+
+	if (si == SEQ_START_TOKEN) {
+		seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
+		return 0;
+	}
+
+	bytes = si->pages << (PAGE_SHIFT - 10);	/* despite the name, in units of 1024 bytes */
+	inuse = READ_ONCE(si->inuse_pages) << (PAGE_SHIFT - 10);	/* same unit */
+
+	file = si->swap_file;
+	len = seq_file_path(swap, file, " \t\n\\");
+	seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
+			len < 40 ?
40 - len : 1, " ",
+			S_ISBLK(file_inode(file)->i_mode) ?
+				"partition" : "file\t",
+			bytes, bytes < 10000000 ? "\t" : "",
+			inuse, inuse < 10000000 ? "\t" : "",
+			si->prio);
+	return 0;
+}
+/*
+ * seq_file iterator callbacks for the swaps listing.
+ */
+static const struct seq_operations swaps_op = {
+	.start =	swap_start,
+	.next =		swap_next,
+	.stop =		swap_stop,
+	.show =		swap_show
+};
+/*
+ * Open handler: set up the seq_file with swaps_op and record the current
+ * value of proc_poll_event, so that swaps_poll() reports only changes that
+ * happen after the open.  Returns the seq_open() error, or 0.
+ */
+static int swaps_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &swaps_op);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->poll_event = atomic_read(&proc_poll_event);
+	return 0;
+}
+/*
+ * proc file operations of the swaps entry created by procswaps_init().
+ */
+static const struct proc_ops swaps_proc_ops = {
+	.proc_flags	= PROC_ENTRY_PERMANENT,
+	.proc_open	= swaps_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= seq_release,
+	.proc_poll	= swaps_poll,
+};
+/*
+ * Register the swaps proc entry.  The result of proc_create() is not
+ * checked and 0 is always returned.
+ */
+static int __init procswaps_init(void)
+{
+	proc_create("swaps", 0, NULL, &swaps_proc_ops);
+	return 0;
+}
+__initcall(procswaps_init);
+#endif /* CONFIG_PROC_FS */
+
+#ifdef MAX_SWAPFILES_CHECK
+static int __init max_swapfiles_check(void)
+{
+	MAX_SWAPFILES_CHECK();	/* the macro is defined elsewhere; this initcall only invokes it */
+	return 0;
+}
+late_initcall(max_swapfiles_check);
+#endif
+/*
+ * Allocate or recycle a swap_info_struct and mark it SWP_USED.
+ *
+ * A new structure is always allocated first.  Under swap_lock the first
+ * slot of swap_info[] without SWP_USED is looked up: if there is one, that
+ * existing structure is reused and the new allocation is freed after the
+ * lock is dropped; otherwise the new structure is published in a new slot.
+ *
+ * Returns the structure, ERR_PTR(-ENOMEM) when the allocation or
+ * percpu_ref_init() fails, or ERR_PTR(-EPERM) when all MAX_SWAPFILES slots
+ * are in use.
+ */
+static struct swap_info_struct *alloc_swap_info(void)
+{
+	struct swap_info_struct *p;
+	struct swap_info_struct *defer = NULL;
+	unsigned int type;
+	int i;
+
+	p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
+	if (!p)
+		return ERR_PTR(-ENOMEM);
+
+	if (percpu_ref_init(&p->users, swap_users_ref_free,
+			    PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
+		kvfree(p);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	spin_lock(&swap_lock);
+	for (type = 0; type < nr_swapfiles; type++) {
+		if (!(swap_info[type]->flags & SWP_USED))
+			break;
+	}
+	if (type >= MAX_SWAPFILES) {
+		spin_unlock(&swap_lock);
+		percpu_ref_exit(&p->users);
+		kvfree(p);
+		return ERR_PTR(-EPERM);
+	}
+	if (type >= nr_swapfiles) {
+		p->type = type;
+		/*
+		 * Publish the swap_info_struct after initializing it.
+		 * Note that kvzalloc() above zeroes all its fields.
+		 */
+		smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
+		nr_swapfiles++;
+	} else {
+		defer = p;	/* the fresh allocation is not needed; it is freed once swap_lock is dropped */
+		p = swap_info[type];
+		/*
+		 * Do not memset this entry: a racing procfs swap_next()
+		 * would be relying on p->type to remain valid.
+		 */
+	}
+	p->swap_extent_root = RB_ROOT;
+	plist_node_init(&p->list, 0);
+	for_each_node(i)
+		plist_node_init(&p->avail_lists[i], 0);
+	p->flags = SWP_USED;
+	spin_unlock(&swap_lock);
+	if (defer) {
+		percpu_ref_exit(&defer->users);
+		kvfree(defer);
+	}
+	spin_lock_init(&p->lock);
+	spin_lock_init(&p->cont_lock);
+	init_completion(&p->comp);
+
+	return p;
+}
+/*
+ * Bind swap area @p to the block device behind @inode.
+ *
+ * For a block device inode the device is opened exclusively with @p as the
+ * holder, its current block size is saved in p->old_block_size, the block
+ * size is set to PAGE_SIZE and SWP_BLKDEV is set; zoned devices are
+ * rejected with -EINVAL.  For a regular file p->bdev is set to the s_bdev
+ * of the superblock of the file.  Any other inode type leaves @p untouched.
+ *
+ * Returns 0 on success or a negative error code.  On the set_blocksize()
+ * and zoned-device failures p->bdev stays set, so releasing the device is
+ * left to the caller (the caller is not fully visible here).
+ */
+static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
+{
+	int error;
+
+	if (S_ISBLK(inode->i_mode)) {
+		p->bdev = blkdev_get_by_dev(inode->i_rdev,
+				   FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
+		if (IS_ERR(p->bdev)) {
+			error = PTR_ERR(p->bdev);
+			p->bdev = NULL;
+			return error;
+		}
+		p->old_block_size = block_size(p->bdev);
+		error = set_blocksize(p->bdev, PAGE_SIZE);
+		if (error < 0)
+			return error;
+		/*
+		 * Zoned block devices contain zones that have a sequential
+		 * write only restriction.  Hence zoned block devices are not
+		 * suitable for swapping.  Disallow them here.
+		 */
+		if (bdev_is_zoned(p->bdev))
+			return -EINVAL;
+		p->flags |= SWP_BLKDEV;
+	} else if (S_ISREG(inode->i_mode)) {
+		p->bdev = inode->i_sb->s_bdev;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Find out how many pages are allowed for a single swap device. There
+ * are two limiting factors:
+ * 1) the number of bits for the swap offset in the swp_entry_t type, and
+ * 2) the number of bits in the swap pte, as defined by the different
+ * architectures.
+ *
+ * In order to find the largest possible bit mask, a swap entry with
+ * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
+ * decoded to a swp_entry_t again, and finally the swap offset is
+ * extracted.
+ *
+ * This will mask all the bits from the initial ~0UL mask that can't
+ * be encoded in either the swp_entry_t or the architecture definition
+ * of a swap pte.
+ */
+unsigned long generic_max_swapfile_size(void)
+{
+	return swp_offset(pte_to_swp_entry(
+			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+}
+
+/* Can be overridden by an architecture for additional checks. */
+__weak unsigned long arch_max_swapfile_size(void)
+{
+	return generic_max_swapfile_size();
+}
+
+static unsigned long read_swap_header(struct swap_info_struct *p,
+					union swap_header *swap_header,
+					struct inode *inode)
+{
+	int i;
+	unsigned long maxpages;
+	unsigned long swapfilepages;
+	unsigned long last_page;
+
+	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
+		pr_err("Unable to find swap-space signature\n");
+		return 0;
+	}
+
+	/* swap partition endianness hack... */
+	if (swab32(swap_header->info.version) == 1) {
+		swab32s(&swap_header->info.version);
+		swab32s(&swap_header->info.last_page);
+		swab32s(&swap_header->info.nr_badpages);
+		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+			return 0;
+		for (i = 0; i < swap_header->info.nr_badpages; i++)
+			swab32s(&swap_header->info.badpages[i]);
+	}
+	/* Check the swap header's sub-version */
+	if (swap_header->info.version != 1) {
+		pr_warn("Unable to handle swap header version %d\n",
+			swap_header->info.version);
+		return 0;	/* every rejection in this function is reported as 0 pages */
+	}
+
+	p->lowest_bit  = 1;	/* offset 0 is left out of the scan range */
+	p->cluster_next = 1;
+	p->cluster_nr = 0;
+
+	maxpages = swapfile_maximum_size;
+	last_page = swap_header->info.last_page;
+	if (!last_page) {
+		pr_warn("Empty swap-file\n");
+		return 0;
+	}
+	if (last_page > maxpages) {
+		pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
+			maxpages << (PAGE_SHIFT - 10),
+			last_page << (PAGE_SHIFT - 10));	/* warning only: maxpages keeps the smaller limit */
+	}
+	if (maxpages > last_page) {
+		maxpages = last_page + 1;	/* last_page is an index, so the page count is one more */
+		/* p->max is an unsigned int: don't overflow it */
+		if ((unsigned int)maxpages == 0)
+			maxpages = UINT_MAX;
+	}
+	p->highest_bit = maxpages - 1;
+
+	if
(!maxpages)
+		return 0;
+	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
+	if (swapfilepages && maxpages > swapfilepages) {	/* an inode size below one page skips this check */
+		pr_warn("Swap area shorter than signature indicates\n");
+		return 0;
+	}
+	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
+		return 0;	/* a bad-page list is rejected for regular files */
+	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
+		return 0;
+
+	return maxpages;
+}
+/*
+ * Column counts used by setup_swap_map_and_extents() when it spreads free
+ * clusters over the free list: SWAP_CLUSTER_COLS is the larger of the
+ * number of swap_cluster_info structures per L1 cache line and the number
+ * of clusters per swap address space.
+ */
+#define SWAP_CLUSTER_INFO_COLS						\
+	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
+#define SWAP_CLUSTER_SPACE_COLS						\
+	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
+#define SWAP_CLUSTER_COLS						\
+	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
+/*
+ * Initialise @swap_map and @cluster_info for swap area @p from the on-disk
+ * header, then build the extent tree via setup_swap_extents().
+ *
+ * Page 0 (the header) and every page of the bad-page list are marked
+ * SWAP_MAP_BAD.  Clusters whose usage count is still zero afterwards are
+ * flagged free and appended to p->free_clusters (only when @cluster_info
+ * is not NULL).
+ *
+ * Returns the value of setup_swap_extents() on success, its negative error
+ * on failure, or -EINVAL when a bad-page entry is 0 or beyond last_page, or
+ * when no usable page remains.
+ */
+static int setup_swap_map_and_extents(struct swap_info_struct *p,
+					union swap_header *swap_header,
+					unsigned char *swap_map,
+					struct swap_cluster_info *cluster_info,
+					unsigned long maxpages,
+					sector_t *span)
+{
+	unsigned int j, k;
+	unsigned int nr_good_pages;
+	int nr_extents;
+	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+	unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
+	unsigned long i, idx;
+
+	nr_good_pages = maxpages - 1;	/* omit header page */
+
+	cluster_list_init(&p->free_clusters);
+	cluster_list_init(&p->discard_clusters);
+
+	for (i = 0; i < swap_header->info.nr_badpages; i++) {
+		unsigned int page_nr = swap_header->info.badpages[i];
+		if (page_nr == 0 || page_nr > swap_header->info.last_page)
+			return -EINVAL;
+		if (page_nr < maxpages) {	/* bad pages at or beyond maxpages are ignored */
+			swap_map[page_nr] = SWAP_MAP_BAD;
+			nr_good_pages--;
+			/*
+			 * Haven't marked the cluster free yet, no list
+			 * operation involved
+			 */
+			inc_cluster_info_page(p, cluster_info, page_nr);
+		}
+	}
+
+	/* Haven't marked the cluster free yet, no list operation involved */
+	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
+		inc_cluster_info_page(p, cluster_info, i);
+
+	if (nr_good_pages) {
+		swap_map[0] = SWAP_MAP_BAD;
+		/*
+		 * Not mark the cluster free yet, no list
+		 * operation involved
+		 */
+		inc_cluster_info_page(p, cluster_info, 0);
+		p->max = maxpages;
+		p->pages = nr_good_pages;
+		nr_extents = setup_swap_extents(p, span);
+		if (nr_extents < 0)
+			return nr_extents;
+		nr_good_pages = p->pages;	/* re-read p->pages after setup_swap_extents() has run */
+	}
+	if (!nr_good_pages) {
+		pr_warn("Empty swap-file\n");
+		return -EINVAL;
+	}
+
+	if (!cluster_info)
+		return nr_extents;	/* without cluster_info there is no free list to fill */
+
+
+	/*
+	 * Reduce false cache line sharing between cluster_info and
+	 * sharing same address space.
+	 *
+	 * The clusters are visited column by column (stride
+	 * SWAP_CLUSTER_COLS), starting with the column that holds
+	 * p->cluster_next, so consecutive free-list entries are
+	 * SWAP_CLUSTER_COLS clusters apart.
+	 */
+	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
+		j = (k + col) % SWAP_CLUSTER_COLS;
+		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
+			idx = i * SWAP_CLUSTER_COLS + j;
+			if (idx >= nr_clusters)
+				continue;
+			if (cluster_count(&cluster_info[idx]))
+				continue;	/* a cluster with a non-zero count is not put on the free list */
+			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
+			cluster_list_add_tail(&p->free_clusters, cluster_info,
+					      idx);
+		}
+	}
+	return nr_extents;
+}
+
+SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
+{
+	struct swap_info_struct *p;
+	struct filename *name;
+	struct file *swap_file = NULL;
+	struct address_space *mapping;
+	struct dentry *dentry;
+	int prio;
+	int error;
+	union swap_header *swap_header;
+	int nr_extents;
+	sector_t span;
+	unsigned long maxpages;
+	unsigned char *swap_map = NULL;
+	struct swap_cluster_info *cluster_info = NULL;
+	unsigned long *frontswap_map = NULL;
+	struct page *page = NULL;
+	struct inode *inode = NULL;
+	bool inced_nr_rotate_swap = false;
+
+	if (swap_flags & ~SWAP_FLAGS_VALID)
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!swap_avail_heads)
+		return -ENOMEM;
+
+	p = alloc_swap_info();
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	INIT_WORK(&p->discard_work, swap_discard_work);
+
+	name = getname(specialfile);
+	if (IS_ERR(name)) {
+		error = PTR_ERR(name);
+		name = NULL;
+		goto bad_swap;
+	}
+	swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
+	if (IS_ERR(swap_file)) {
+		error = PTR_ERR(swap_file);
+		swap_file = NULL;
+		goto bad_swap;
+	}
+
+
p->swap_file = swap_file; + mapping = swap_file->f_mapping; + dentry = swap_file->f_path.dentry; + inode = mapping->host; + + error = claim_swapfile(p, inode); + if (unlikely(error)) + goto bad_swap; + + inode_lock(inode); + if (d_unlinked(dentry) || cant_mount(dentry)) { + error = -ENOENT; + goto bad_swap_unlock_inode; + } + if (IS_SWAPFILE(inode)) { + error = -EBUSY; + goto bad_swap_unlock_inode; + } + + /* + * Read the swap header. + */ + if (!mapping->a_ops->read_folio) { + error = -EINVAL; + goto bad_swap_unlock_inode; + } + page = read_mapping_page(mapping, 0, swap_file); + if (IS_ERR(page)) { + error = PTR_ERR(page); + goto bad_swap_unlock_inode; + } + swap_header = kmap(page); + + maxpages = read_swap_header(p, swap_header, inode); + if (unlikely(!maxpages)) { + error = -EINVAL; + goto bad_swap_unlock_inode; + } + + /* OK, set up the swap map and apply the bad block list */ + swap_map = vzalloc(maxpages); + if (!swap_map) { + error = -ENOMEM; + goto bad_swap_unlock_inode; + } + + if (p->bdev && bdev_stable_writes(p->bdev)) + p->flags |= SWP_STABLE_WRITES; + + if (p->bdev && p->bdev->bd_disk->fops->rw_page) + p->flags |= SWP_SYNCHRONOUS_IO; + + if (p->bdev && bdev_nonrot(p->bdev)) { + int cpu; + unsigned long ci, nr_cluster; + + p->flags |= SWP_SOLIDSTATE; + p->cluster_next_cpu = alloc_percpu(unsigned int); + if (!p->cluster_next_cpu) { + error = -ENOMEM; + goto bad_swap_unlock_inode; + } + /* + * select a random position to start with to help wear leveling + * SSD + */ + for_each_possible_cpu(cpu) { + per_cpu(*p->cluster_next_cpu, cpu) = + 1 + prandom_u32_max(p->highest_bit); + } + nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + + cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), + GFP_KERNEL); + if (!cluster_info) { + error = -ENOMEM; + goto bad_swap_unlock_inode; + } + + for (ci = 0; ci < nr_cluster; ci++) + spin_lock_init(&((cluster_info + ci)->lock)); + + p->percpu_cluster = alloc_percpu(struct percpu_cluster); + if 
(!p->percpu_cluster) { + error = -ENOMEM; + goto bad_swap_unlock_inode; + } + for_each_possible_cpu(cpu) { + struct percpu_cluster *cluster; + cluster = per_cpu_ptr(p->percpu_cluster, cpu); + cluster_set_null(&cluster->index); + } + } else { + atomic_inc(&nr_rotate_swap); + inced_nr_rotate_swap = true; + } + + error = swap_cgroup_swapon(p->type, maxpages); + if (error) + goto bad_swap_unlock_inode; + + nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, + cluster_info, maxpages, &span); + if (unlikely(nr_extents < 0)) { + error = nr_extents; + goto bad_swap_unlock_inode; + } + /* frontswap enabled? set up bit-per-page map for frontswap */ + if (IS_ENABLED(CONFIG_FRONTSWAP)) + frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages), + sizeof(long), + GFP_KERNEL); + + if ((swap_flags & SWAP_FLAG_DISCARD) && + p->bdev && bdev_max_discard_sectors(p->bdev)) { + /* + * When discard is enabled for swap with no particular + * policy flagged, we set all swap discard flags here in + * order to sustain backward compatibility with older + * swapon(8) releases. + */ + p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | + SWP_PAGE_DISCARD); + + /* + * By flagging sys_swapon, a sysadmin can tell us to + * either do single-time area discards only, or to just + * perform discards for released swap page-clusters. + * Now it's time to adjust the p->flags accordingly. + */ + if (swap_flags & SWAP_FLAG_DISCARD_ONCE) + p->flags &= ~SWP_PAGE_DISCARD; + else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) + p->flags &= ~SWP_AREA_DISCARD; + + /* issue a swapon-time discard if it's still required */ + if (p->flags & SWP_AREA_DISCARD) { + int err = discard_swap(p); + if (unlikely(err)) + pr_err("swapon: discard_swap(%p): %d\n", + p, err); + } + } + + error = init_swap_address_space(p->type, maxpages); + if (error) + goto bad_swap_unlock_inode; + + /* + * Flush any pending IO and dirty mappings before we start using this + * swap device. 
+ */ + inode->i_flags |= S_SWAPFILE; + error = inode_drain_writes(inode); + if (error) { + inode->i_flags &= ~S_SWAPFILE; + goto free_swap_address_space; + } + + mutex_lock(&swapon_mutex); + prio = -1; + if (swap_flags & SWAP_FLAG_PREFER) + prio = + (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; + enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); + + pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", + p->pages<<(PAGE_SHIFT-10), name->name, p->prio, + nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), + (p->flags & SWP_SOLIDSTATE) ? "SS" : "", + (p->flags & SWP_DISCARDABLE) ? "D" : "", + (p->flags & SWP_AREA_DISCARD) ? "s" : "", + (p->flags & SWP_PAGE_DISCARD) ? "c" : "", + (frontswap_map) ? "FS" : ""); + + mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); + + error = 0; + goto out; +free_swap_address_space: + exit_swap_address_space(p->type); +bad_swap_unlock_inode: + inode_unlock(inode); +bad_swap: + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; + free_percpu(p->cluster_next_cpu); + p->cluster_next_cpu = NULL; + if (inode && S_ISBLK(inode->i_mode) && p->bdev) { + set_blocksize(p->bdev, p->old_block_size); + blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + } + inode = NULL; + destroy_swap_extents(p); + swap_cgroup_swapoff(p->type); + spin_lock(&swap_lock); + p->swap_file = NULL; + p->flags = 0; + spin_unlock(&swap_lock); + vfree(swap_map); + kvfree(cluster_info); + kvfree(frontswap_map); + if (inced_nr_rotate_swap) + atomic_dec(&nr_rotate_swap); + if (swap_file) + filp_close(swap_file, NULL); +out: + if (page && !IS_ERR(page)) { + kunmap(page); + put_page(page); + } + if (name) + putname(name); + if (inode) + inode_unlock(inode); + if (!error) + enable_swap_slots_cache(); + return error; +} + +void si_swapinfo(struct sysinfo *val) +{ + unsigned int type; + unsigned long nr_to_be_unused = 0; + + spin_lock(&swap_lock); + 
for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *si = swap_info[type]; + + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) + nr_to_be_unused += READ_ONCE(si->inuse_pages); + } + val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; + val->totalswap = total_swap_pages + nr_to_be_unused; + spin_unlock(&swap_lock); +} + +/* + * Verify that a swap entry is valid and increment its swap map count. + * + * Returns error code in following case. + * - success -> 0 + * - swp_entry is invalid -> EINVAL + * - swp_entry is migration entry -> EINVAL + * - swap-cache reference is requested but there is already one. -> EEXIST + * - swap-cache reference is requested but the entry is not used. -> ENOENT + * - swap-mapped reference requested but needs continued swap count. -> ENOMEM + */ +static int __swap_duplicate(swp_entry_t entry, unsigned char usage) +{ + struct swap_info_struct *p; + struct swap_cluster_info *ci; + unsigned long offset; + unsigned char count; + unsigned char has_cache; + int err; + + p = get_swap_device(entry); + if (!p) + return -EINVAL; + + offset = swp_offset(entry); + ci = lock_cluster_or_swap_info(p, offset); + + count = p->swap_map[offset]; + + /* + * swapin_readahead() doesn't check if a swap entry is valid, so the + * swap entry could be SWAP_MAP_BAD. Check here with lock held. 
+ */ + if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { + err = -ENOENT; + goto unlock_out; + } + + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; + err = 0; + + if (usage == SWAP_HAS_CACHE) { + + /* set SWAP_HAS_CACHE if there is no cache and entry is used */ + if (!has_cache && count) + has_cache = SWAP_HAS_CACHE; + else if (has_cache) /* someone else added cache */ + err = -EEXIST; + else /* no users remaining */ + err = -ENOENT; + + } else if (count || has_cache) { + + if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) + count += usage; + else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) + err = -EINVAL; + else if (swap_count_continued(p, offset, count)) + count = COUNT_CONTINUED; + else + err = -ENOMEM; + } else + err = -ENOENT; /* unused swap entry */ + + WRITE_ONCE(p->swap_map[offset], count | has_cache); + +unlock_out: + unlock_cluster_or_swap_info(p, ci); + put_swap_device(p); + return err; +} + +/* + * Help swapoff by noting that swap entry belongs to shmem/tmpfs + * (in which case its reference count is never incremented). + */ +void swap_shmem_alloc(swp_entry_t entry) +{ + __swap_duplicate(entry, SWAP_MAP_SHMEM); +} + +/* + * Increase reference count of swap entry by 1. + * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required + * but could not be atomically allocated. Returns 0, just as if it succeeded, + * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which + * might occur if a page table entry has got corrupted. + */ +int swap_duplicate(swp_entry_t entry) +{ + int err = 0; + + while (!err && __swap_duplicate(entry, 1) == -ENOMEM) + err = add_swap_count_continuation(entry, GFP_ATOMIC); + return err; +} + +/* + * @entry: swap entry for which we allocate swap cache. + * + * Called when allocating swap cache for existing swap entry, + * This can return error codes. Returns 0 at success. + * -EEXIST means there is a swap cache. + * Note: return code is different from swap_duplicate(). 
+ */ +int swapcache_prepare(swp_entry_t entry) +{ + return __swap_duplicate(entry, SWAP_HAS_CACHE); +} + +struct swap_info_struct *swp_swap_info(swp_entry_t entry) +{ + return swap_type_to_swap_info(swp_type(entry)); +} + +struct swap_info_struct *page_swap_info(struct page *page) +{ + swp_entry_t entry = { .val = page_private(page) }; + return swp_swap_info(entry); +} + +/* + * out-of-line methods to avoid include hell. + */ +struct address_space *swapcache_mapping(struct folio *folio) +{ + return page_swap_info(&folio->page)->swap_file->f_mapping; +} +EXPORT_SYMBOL_GPL(swapcache_mapping); + +pgoff_t __page_file_index(struct page *page) +{ + swp_entry_t swap = { .val = page_private(page) }; + return swp_offset(swap); +} +EXPORT_SYMBOL_GPL(__page_file_index); + +/* + * add_swap_count_continuation - called when a swap count is duplicated + * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's + * page of the original vmalloc'ed swap_map, to hold the continuation count + * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called + * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. + * + * These continuation pages are seldom referenced: the common paths all work + * on the original swap_map, only referring to a continuation page when the + * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. + * + * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding + * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) + * can be called after dropping locks. 
 + */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+	struct swap_info_struct *si;
+	struct swap_cluster_info *ci;
+	struct page *head;
+	struct page *page;
+	struct page *list_page;
+	pgoff_t offset;
+	unsigned char count;
+	int ret = 0;
+
+	/*
+	 * Returns 0 unless a continuation page turned out to be needed and
+	 * the allocation below failed, in which case -ENOMEM is returned.
+	 * The page is allocated up front, before any lock is taken, and is
+	 * freed again at "outer" unless it was linked into the list.
+	 *
+	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
+	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
+	 */
+	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+
+	si = get_swap_device(entry);
+	if (!si) {
+		/*
+		 * An acceptable race has occurred since the failing
+		 * __swap_duplicate(): the swap device may have been
+		 * swapped off.  ret is still 0 here.
+		 */
+		goto outer;
+	}
+	spin_lock(&si->lock);
+
+	offset = swp_offset(entry);
+
+	ci = lock_cluster(si, offset);
+
+	count = swap_count(si->swap_map[offset]);
+
+	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+		/*
+		 * The higher the swap count, the more likely it is that tasks
+		 * will race to add swap count continuation: we need to avoid
+		 * over-provisioning.
+		 */
+		goto out;
+	}
+
+	if (!page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+	 * no architecture is using highmem pages for kernel page tables: so it
+	 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
+	 */
+	head = vmalloc_to_page(si->swap_map + offset);
+	offset &= ~PAGE_MASK;	/* keep only the offset within the page */
+
+	spin_lock(&si->cont_lock);
+	/*
+	 * Page allocation does not initialize the page's lru field,
+	 * but it does always reset its private field.  So a zero private
+	 * field means this swap_map page has no continuation list yet:
+	 * set one up and mark both the page and the device SWP_CONTINUED.
+	 */
+	if (!page_private(head)) {
+		BUG_ON(count & COUNT_CONTINUED);
+		INIT_LIST_HEAD(&head->lru);
+		set_page_private(head, SWP_CONTINUED);
+		si->flags |= SWP_CONTINUED;
+	}
+
+	list_for_each_entry(list_page, &head->lru, lru) {
+		unsigned char *map;
+
+		/*
+		 * If the previous map said no continuation, but we've found
+		 * a continuation page, free our allocation and use this one.
+		 */
+		if (!(count & COUNT_CONTINUED))
+			goto out_unlock_cont;
+
+		map = kmap_atomic(list_page) + offset;
+		count = *map;
+		kunmap_atomic(map);
+
+		/*
+		 * If this continuation count now has some space in it,
+		 * free our allocation and use this one.
+		 */
+		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+			goto out_unlock_cont;
+	}
+
+	list_add_tail(&page->lru, &head->lru);
+	page = NULL;			/* now it's attached, don't free it */
+out_unlock_cont:			/* locks are dropped in reverse order */
+	spin_unlock(&si->cont_lock);
+out:
+	unlock_cluster(ci);
+	spin_unlock(&si->lock);
+	put_swap_device(si);
+outer:
+	if (page)
+		__free_page(page);
+	return ret;
+}
+
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
+ * lock.
 + */
+static bool swap_count_continued(struct swap_info_struct *si,
+				 pgoff_t offset, unsigned char count)
+{
+	struct page *head;
+	struct page *page;
+	unsigned char *map;
+	bool ret;
+
+	head = vmalloc_to_page(si->swap_map + offset);
+	if (page_private(head) != SWP_CONTINUED) {
+		BUG_ON(count & COUNT_CONTINUED);
+		return false;	/* need to add count continuation */
+	}
+
+	spin_lock(&si->cont_lock);
+	offset &= ~PAGE_MASK;	/* keep only the offset within the page */
+	page = list_next_entry(head, lru);
+	map = kmap_atomic(page) + offset;
+
+	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
+		goto init_map;		/* jump over SWAP_CONT_MAX checks */
+
+	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
+		/*
+		 * Think of how you add 1 to 999: skip every "digit" that is
+		 * already at its maximum, bump the first one that is not, then
+		 * walk back resetting the skipped digits.
+		 */
+		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
+			kunmap_atomic(map);
+			page = list_next_entry(page, lru);
+			BUG_ON(page == head);
+			map = kmap_atomic(page) + offset;
+		}
+		if (*map == SWAP_CONT_MAX) {
+			kunmap_atomic(map);
+			page = list_next_entry(page, lru);
+			if (page == head) {
+				ret = false;	/* add count continuation */
+				goto out;
+			}
+			map = kmap_atomic(page) + offset;
+init_map:		*map = 0;		/* we didn't zero the page */
+		}
+		*map += 1;
+		kunmap_atomic(map);
+		while ((page = list_prev_entry(page, lru)) != head) {
+			map = kmap_atomic(page) + offset;
+			*map = COUNT_CONTINUED;
+			kunmap_atomic(map);
+		}
+		ret = true;			/* incremented */
+
+	} else {				/* decrementing */
+		/*
+		 * Think of how you subtract 1 from 1000: skip the empty
+		 * "digits", borrow from the first non-empty one, then walk
+		 * back refilling the skipped digits to SWAP_CONT_MAX.
+		 */
+		BUG_ON(count != COUNT_CONTINUED);
+		while (*map == COUNT_CONTINUED) {
+			kunmap_atomic(map);
+			page = list_next_entry(page, lru);
+			BUG_ON(page == head);
+			map = kmap_atomic(page) + offset;
+		}
+		BUG_ON(*map == 0);
+		*map -= 1;
+		if (*map == 0)
+			count = 0;
+		kunmap_atomic(map);
+		while ((page = list_prev_entry(page, lru)) != head) {
+			map = kmap_atomic(page) + offset;
+			*map = SWAP_CONT_MAX | count;
+			count = COUNT_CONTINUED;
+			kunmap_atomic(map);
+		}
+		ret = count == COUNT_CONTINUED;
+	}
+out:
+	spin_unlock(&si->cont_lock);
+	return ret;
+}
+
+/*
+ * free_swap_count_continuations - swapoff free all the continuation pages
+ * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
+ *
+ * The walk visits one swap_map page per iteration (offset advances by
+ * PAGE_SIZE) and frees every page linked on that page's lru list when the
+ * page's private field is non-zero.
+ */
+static void free_swap_count_continuations(struct swap_info_struct *si)
+{
+	pgoff_t offset;
+
+	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
+		struct page *head;
+		head = vmalloc_to_page(si->swap_map + offset);
+		if (page_private(head)) {
+			struct page *page, *next;
+
+			list_for_each_entry_safe(page, next, &head->lru, lru) {
+				list_del(&page->lru);
+				__free_page(page);
+			}
+		}
+	}
+}
+
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+/*
+ * Schedule a blkcg throttle against the first swap device that has a block
+ * device on the available list of @page's node.  Does nothing unless
+ * @gfp_mask allows IO, blk_cgroup_congested() reports congestion, and no
+ * throttle is already queued for the current task.
+ */
+void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
+{
+	struct swap_info_struct *si, *next;
+	int nid = page_to_nid(page);
+
+	if (!(gfp_mask & __GFP_IO))
+		return;
+
+	if (!blk_cgroup_congested())
+		return;
+
+	/*
+	 * We've already scheduled a throttle, avoid taking the global swap
+	 * lock.
+	 */
+	if (current->throttle_queue)
+		return;
+
+	spin_lock(&swap_avail_lock);
+	plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
+				  avail_lists[nid]) {
+		if (si->bdev) {
+			blkcg_schedule_throttle(si->bdev->bd_disk, true);
+			break;
+		}
+	}
+	spin_unlock(&swap_avail_lock);
+}
+#endif
+
+/*
+ * Boot-time setup: allocate and initialise one plist head per node for
+ * swap_avail_heads, record the architecture's maximum swapfile size and,
+ * with CONFIG_MIGRATION, set swap_migration_ad_supported when that size is
+ * at least 1UL << SWP_MIG_TOTAL_BITS.  Returns -ENOMEM if the heads cannot
+ * be allocated, otherwise 0.
+ */
+static int __init swapfile_init(void)
+{
+	int nid;
+
+	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
+					 GFP_KERNEL);
+	if (!swap_avail_heads) {
+		pr_emerg("Not enough memory for swap heads, swap is disabled\n");
+		return -ENOMEM;
+	}
+
+	for_each_node(nid)
+		plist_head_init(&swap_avail_heads[nid]);
+
+	swapfile_maximum_size = arch_max_swapfile_size();
+
+#ifdef CONFIG_MIGRATION
+	if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
+		swap_migration_ad_supported = true;
+#endif	/* CONFIG_MIGRATION */
+
+	return 0;
+}
+subsys_initcall(swapfile_init);
diff --git a/mm/truncate.c b/mm/truncate.c
new file mode 100644
index 000000000..0d4dd233f
--- /dev/null
+++ b/mm/truncate.c
@@ -0,0
+1,867 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm/truncate.c - code for taking down pages from address_spaces + * + * Copyright (C) 2002, Linus Torvalds + * + * 10Sep2002 Andrew Morton + * Initial version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +/* + * Regular page slots are stabilized by the page lock even without the tree + * itself locked. These unlocked entries need verification under the tree + * lock. + */ +static inline void __clear_shadow_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + XA_STATE(xas, &mapping->i_pages, index); + + xas_set_update(&xas, workingset_update_node); + if (xas_load(&xas) != entry) + return; + xas_store(&xas, NULL); +} + +static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, + void *entry) +{ + spin_lock(&mapping->host->i_lock); + xa_lock_irq(&mapping->i_pages); + __clear_shadow_entry(mapping, index, entry); + xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); +} + +/* + * Unconditionally remove exceptional entries. Usually called from truncate + * path. Note that the folio_batch may be altered by this function by removing + * exceptional entries similar to what folio_batch_remove_exceptionals() does. 
 + */
+static void truncate_folio_batch_exceptionals(struct address_space *mapping,
+				struct folio_batch *fbatch, pgoff_t *indices)
+{
+	int i, j;
+	bool dax;
+
+	/* Handled by shmem itself */
+	if (shmem_mapping(mapping))
+		return;
+
+	/* Find the first value (exceptional) entry; j stops on it. */
+	for (j = 0; j < folio_batch_count(fbatch); j++)
+		if (xa_is_value(fbatch->folios[j]))
+			break;
+
+	/* No exceptional entries at all: leave the batch untouched. */
+	if (j == folio_batch_count(fbatch))
+		return;
+
+	/* The locks are only taken on the non-DAX (shadow entry) path. */
+	dax = dax_mapping(mapping);
+	if (!dax) {
+		spin_lock(&mapping->host->i_lock);
+		xa_lock_irq(&mapping->i_pages);
+	}
+
+	/*
+	 * Compact the batch in place: real folios are moved down to slot j,
+	 * exceptional entries are removed from the mapping and dropped from
+	 * the batch.  Note that indices[] is not compacted alongside.
+	 */
+	for (i = j; i < folio_batch_count(fbatch); i++) {
+		struct folio *folio = fbatch->folios[i];
+		pgoff_t index = indices[i];
+
+		if (!xa_is_value(folio)) {
+			fbatch->folios[j++] = folio;
+			continue;
+		}
+
+		if (unlikely(dax)) {
+			dax_delete_mapping_entry(mapping, index);
+			continue;
+		}
+
+		__clear_shadow_entry(mapping, index, folio);
+	}
+
+	if (!dax) {
+		xa_unlock_irq(&mapping->i_pages);
+		if (mapping_shrinkable(mapping))
+			inode_add_lru(mapping->host);
+		spin_unlock(&mapping->host->i_lock);
+	}
+	fbatch->nr = j;		/* number of real folios kept */
+}
+
+/*
+ * Invalidate exceptional entry if easily possible. This handles exceptional
+ * entries for invalidate_inode_pages().
+ *
+ * Always returns 1, including for shmem and DAX mappings where nothing is
+ * cleared here.
+ */
+static int invalidate_exceptional_entry(struct address_space *mapping,
+					pgoff_t index, void *entry)
+{
+	/* Handled by shmem itself, or for DAX we do nothing. */
+	if (shmem_mapping(mapping) || dax_mapping(mapping))
+		return 1;
+	clear_shadow_entry(mapping, index, entry);
+	return 1;
+}
+
+/*
+ * Invalidate exceptional entry if clean. This handles exceptional entries for
+ * invalidate_inode_pages2() so for DAX it evicts only clean entries.
+ */ +static int invalidate_exceptional_entry2(struct address_space *mapping, + pgoff_t index, void *entry) +{ + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return 1; + if (dax_mapping(mapping)) + return dax_invalidate_mapping_entry_sync(mapping, index); + clear_shadow_entry(mapping, index, entry); + return 1; +} + +/** + * folio_invalidate - Invalidate part or all of a folio. + * @folio: The folio which is affected. + * @offset: start of the range to invalidate + * @length: length of the range to invalidate + * + * folio_invalidate() is called when all or part of the folio has become + * invalidated by a truncate operation. + * + * folio_invalidate() does not have to release all buffers, but it must + * ensure that no dirty buffer is left outside @offset and that no I/O + * is underway against any of the blocks which are outside the truncation + * point. Because the caller is about to free (and possibly reuse) those + * blocks on-disk. + */ +void folio_invalidate(struct folio *folio, size_t offset, size_t length) +{ + const struct address_space_operations *aops = folio->mapping->a_ops; + + if (aops->invalidate_folio) + aops->invalidate_folio(folio, offset, length); +} +EXPORT_SYMBOL_GPL(folio_invalidate); + +/* + * If truncate cannot remove the fs-private metadata from the page, the page + * becomes orphaned. It will be left on the LRU and may even be mapped into + * user pagetables if we're racing with filemap_fault(). + * + * We need to bail out if page->mapping is no longer equal to the original + * mapping. This happens a) when the VM reclaimed the page while we waited on + * its lock, b) when a concurrent invalidate_mapping_pages got there first and + * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 
 + */
+static void truncate_cleanup_folio(struct folio *folio)
+{
+	if (folio_mapped(folio))
+		unmap_mapping_folio(folio);
+
+	if (folio_has_private(folio))
+		folio_invalidate(folio, 0, folio_size(folio));
+
+	/*
+	 * Some filesystems seem to re-dirty the page even after
+	 * the VM has canceled the dirty bit (eg ext3 journaling).
+	 * Hence dirty accounting check is placed after invalidation.
+	 */
+	folio_cancel_dirty(folio);
+	folio_clear_mappedtodisk(folio);
+}
+
+/*
+ * Clean up @folio and remove it from the page cache.  Returns -EIO without
+ * touching the folio if it no longer belongs to @mapping, otherwise 0.
+ */
+int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
+{
+	if (folio->mapping != mapping)
+		return -EIO;
+
+	truncate_cleanup_folio(folio);
+	filemap_remove_folio(folio);
+	return 0;
+}
+
+/*
+ * Handle partial folios.  The folio may be entirely within the
+ * range if a split has raced with us.  If not, we zero the part of the
+ * folio that's within the [start, end] range, and then split the folio if
+ * it's large.  split_page_range() will discard pages which now lie beyond
+ * i_size, and we rely on the caller to discard pages which lie within a
+ * newly created hole.
+ *
+ * Returns false if splitting failed so the caller can avoid
+ * discarding the entire folio which is stubbornly unsplit.
+ * More precisely, false is returned only when split_folio() fails and the
+ * folio is dirty; a clean folio that cannot be split is truncated whole and
+ * true is returned.
+ */
+bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
+{
+	loff_t pos = folio_pos(folio);
+	unsigned int offset, length;
+
+	/* offset/length: the part of this folio that overlaps [start, end] */
+	if (pos < start)
+		offset = start - pos;
+	else
+		offset = 0;
+	length = folio_size(folio);
+	if (pos + length <= (u64)end)
+		length = length - offset;
+	else
+		length = end + 1 - pos - offset;
+
+	folio_wait_writeback(folio);
+	/* The whole folio lies inside the range: remove it outright. */
+	if (length == folio_size(folio)) {
+		truncate_inode_folio(folio->mapping, folio);
+		return true;
+	}
+
+	/*
+	 * We may be zeroing pages we're about to discard, but it avoids
+	 * doing a complex calculation here, and then doing the zeroing
+	 * anyway if the page split fails.
+	 */
+	folio_zero_range(folio, offset, length);
+
+	if (folio_has_private(folio))
+		folio_invalidate(folio, offset, length);
+	if (!folio_test_large(folio))
+		return true;
+	if (split_folio(folio) == 0)
+		return true;
+	if (folio_test_dirty(folio))
+		return false;
+	truncate_inode_folio(folio->mapping, folio);
+	return true;
+}
+
+/*
+ * Used to get rid of pages on hardware memory corruption.
+ *
+ * Returns -EINVAL when @mapping is NULL, -EIO when the mapping's host is not
+ * a regular file, otherwise the result of truncate_inode_folio().
+ */
+int generic_error_remove_page(struct address_space *mapping, struct page *page)
+{
+	VM_BUG_ON_PAGE(PageTail(page), page);
+
+	if (!mapping)
+		return -EINVAL;
+	/*
+	 * Only punch for normal data pages for now.
+	 * Handling other types like directories would need more auditing.
+	 */
+	if (!S_ISREG(mapping->host->i_mode))
+		return -EIO;
+	return truncate_inode_folio(mapping, page_folio(page));
+}
+EXPORT_SYMBOL(generic_error_remove_page);
+
+/*
+ * Try to drop @folio from @mapping.  Returns 0 without removing anything if
+ * the folio is dirty or under writeback, has more references than expected,
+ * or filemap_release_folio() fails; otherwise returns what remove_mapping()
+ * returns.
+ */
+static long mapping_evict_folio(struct address_space *mapping,
+		struct folio *folio)
+{
+	if (folio_test_dirty(folio) || folio_test_writeback(folio))
+		return 0;
+	/* The refcount will be elevated if any page in the folio is mapped */
+	if (folio_ref_count(folio) >
+			folio_nr_pages(folio) + folio_has_private(folio) + 1)
+		return 0;
+	if (!filemap_release_folio(folio, 0))
+		return 0;
+
+	return remove_mapping(mapping, folio);
+}
+
+/**
+ * invalidate_inode_page() - Remove an unused page from the pagecache.
+ * @page: The page to remove.
+ *
+ * Safely invalidate one page from its pagecache mapping.
+ * It only drops clean, unused pages.
+ *
+ * Context: Page must be locked.
+ * Return: The number of pages successfully removed.
 + */
+long invalidate_inode_page(struct page *page)
+{
+	struct folio *folio = page_folio(page);
+	struct address_space *mapping = folio_mapping(folio);
+
+	/* The page may have been truncated before it was locked */
+	if (!mapping)
+		return 0;
+	return mapping_evict_folio(mapping, folio);
+}
+
+/**
+ * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ * @lend: offset to which to truncate (inclusive)
+ *
+ * Truncate the page cache, removing the pages that are between
+ * specified offsets (and zeroing out partial pages
+ * if lstart or lend + 1 is not page aligned).
+ *
+ * Truncate takes two passes - the first pass is nonblocking.  It will not
+ * block on page locks and it will not block on writeback.  The second pass
+ * will wait.  This is to prevent as much IO as possible in the affected region.
+ * The first pass will remove most pages, so the search cost of the second pass
+ * is low.
+ *
+ * We pass down the cache-hot hint to the page freeing code.  Even if the
+ * mapping is large, it is probably the case that the final pages are the most
+ * recently touched, and freeing happens in ascending file offset order.
+ *
+ * Note that since ->invalidate_folio() accepts range to invalidate
+ * truncate_inode_pages_range is able to handle cases where lend + 1 is not
+ * page aligned properly.
+ */
+void truncate_inode_pages_range(struct address_space *mapping,
+				loff_t lstart, loff_t lend)
+{
+	pgoff_t		start;		/* inclusive */
+	pgoff_t		end;		/* exclusive */
+	struct folio_batch fbatch;
+	pgoff_t		indices[PAGEVEC_SIZE];
+	pgoff_t		index;
+	int		i;
+	struct folio	*folio;
+	bool		same_folio;
+
+	if (mapping_empty(mapping))
+		return;
+
+	/*
+	 * 'start' and 'end' always covers the range of pages to be fully
+	 * truncated. Partial pages are covered with 'partial_start' at the
+	 * start of the range and 'partial_end' at the end of the range.
+	 * Note that 'end' is exclusive while 'lend' is inclusive.
+	 *
+	 * (There are no 'partial_start'/'partial_end' variables in this
+	 * function; the partial folios at either edge are handled by the
+	 * two truncate_inode_partial_folio() calls below.)
+	 */
+	start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (lend == -1)
+		/*
+		 * lend == -1 indicates end-of-file so we have to set 'end'
+		 * to the highest possible pgoff_t and since the type is
+		 * unsigned we're using -1.
+		 */
+		end = -1;
+	else
+		end = (lend + 1) >> PAGE_SHIFT;
+
+	/* First pass: the non-blocking one described above. */
+	folio_batch_init(&fbatch);
+	index = start;
+	while (index < end && find_lock_entries(mapping, index, end - 1,
+			&fbatch, indices)) {
+		/* Resume after the last entry returned in this batch. */
+		index = indices[folio_batch_count(&fbatch) - 1] + 1;
+		truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
+		for (i = 0; i < folio_batch_count(&fbatch); i++)
+			truncate_cleanup_folio(fbatch.folios[i]);
+		delete_from_page_cache_batch(mapping, &fbatch);
+		for (i = 0; i < folio_batch_count(&fbatch); i++)
+			folio_unlock(fbatch.folios[i]);
+		folio_batch_release(&fbatch);
+		cond_resched();
+	}
+
+	/*
+	 * Partial folio at the start of the range.  If it could not be
+	 * truncated (false return), move 'start' past it, and pull 'end'
+	 * back to it when it also contains 'lend', so the second pass
+	 * leaves that folio alone.
+	 */
+	same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
+	folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
+	if (folio) {
+		same_folio = lend < folio_pos(folio) + folio_size(folio);
+		if (!truncate_inode_partial_folio(folio, lstart, lend)) {
+			start = folio->index + folio_nr_pages(folio);
+			if (same_folio)
+				end = folio->index;
+		}
+		folio_unlock(folio);
+		folio_put(folio);
+		folio = NULL;
+	}
+
+	/* Partial folio at the end, unless it is the one handled above. */
+	if (!same_folio)
+		folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
+						FGP_LOCK, 0);
+	if (folio) {
+		if (!truncate_inode_partial_folio(folio, lstart, lend))
+			end = folio->index;
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+	/* Second pass: locks each folio and waits for writeback. */
+	index = start;
+	while (index < end) {
+		cond_resched();
+		if (!find_get_entries(mapping, index, end - 1, &fbatch,
+				indices)) {
+			/* If all gone from start onwards, we're done */
+			if (index == start)
+				break;
+			/* Otherwise restart to make sure all gone */
+			index = start;
+			continue;
+		}
+
+		for (i = 0; i < folio_batch_count(&fbatch); i++) {
+			struct folio *folio = fbatch.folios[i];
+
+			/* We rely upon deletion not changing page->index */
+			index = indices[i];
+
+			if (xa_is_value(folio))
+				continue;
+
+			folio_lock(folio);
+			VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+			folio_wait_writeback(folio);
+			truncate_inode_folio(mapping, folio);
+			folio_unlock(folio);
+			/* last page of the folio; index++ below steps past it */
+			index = folio_index(folio) + folio_nr_pages(folio) - 1;
+		}
+		truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
+		folio_batch_release(&fbatch);
+		index++;
+	}
+}
+EXPORT_SYMBOL(truncate_inode_pages_range);
+
+/**
+ * truncate_inode_pages - truncate *all* the pages from an offset
+ * @mapping: mapping to truncate
+ * @lstart: offset from which to truncate
+ *
+ * Called under (and serialised by) inode->i_rwsem and
+ * mapping->invalidate_lock.
+ *
+ * Equivalent to truncate_inode_pages_range() with @lend set to -1, i.e.
+ * through end-of-file.
+ *
+ * Note: When this function returns, there can be a page in the process of
+ * deletion (inside __filemap_remove_folio()) in the specified range.  Thus
+ * mapping->nrpages can be non-zero when this function returns even after
+ * truncation of the whole mapping.
+ */
+void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
+{
+	truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
+}
+EXPORT_SYMBOL(truncate_inode_pages);
+
+/**
+ * truncate_inode_pages_final - truncate *all* pages before inode dies
+ * @mapping: mapping to truncate
+ *
+ * Called under (and serialized by) inode->i_rwsem.
+ *
+ * Filesystems have to use this in the .evict_inode path to inform the
+ * VM that this is the final truncate and the inode is going away.
+ */
+void truncate_inode_pages_final(struct address_space *mapping)
+{
+	/*
+	 * Page reclaim can not participate in regular inode lifetime
+	 * management (can't call iput()) and thus can race with the
+	 * inode teardown.  Tell it when the address space is exiting,
+	 * so that it does not install eviction information after the
+	 * final truncate has begun.
+	 */
+	mapping_set_exiting(mapping);
+
+	if (!mapping_empty(mapping)) {
+		/*
+		 * As truncation uses a lockless tree lookup, cycle
+		 * the tree lock to make sure any ongoing tree
+		 * modification that does not see AS_EXITING is
+		 * completed before starting the final truncate.
+		 */
+		xa_lock_irq(&mapping->i_pages);
+		xa_unlock_irq(&mapping->i_pages);
+	}
+
+	truncate_inode_pages(mapping, 0);
+}
+EXPORT_SYMBOL(truncate_inode_pages_final);
+
+/**
+ * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ * @nr_pagevec: invalidate failed page number for caller
+ *
+ * This helper is similar to invalidate_mapping_pages(), except that it accounts
+ * for pages that are likely on a pagevec and counts them in @nr_pagevec, which
+ * will be used by the caller.  @nr_pagevec may be NULL, in which case no
+ * such count is kept.
+ *
+ * Return: the sum of the per-entry results of invalidate_exceptional_entry()
+ * (for value entries) and mapping_evict_folio() (for folios).
+ */
+unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
+		pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
+{
+	pgoff_t indices[PAGEVEC_SIZE];
+	struct folio_batch fbatch;
+	pgoff_t index = start;
+	unsigned long ret;
+	unsigned long count = 0;
+	int i;
+
+	folio_batch_init(&fbatch);
+	while (find_lock_entries(mapping, index, end, &fbatch, indices)) {
+		for (i = 0; i < folio_batch_count(&fbatch); i++) {
+			struct folio *folio = fbatch.folios[i];
+
+			/* We rely upon deletion not changing folio->index */
+			index = indices[i];
+
+			if (xa_is_value(folio)) {
+				count += invalidate_exceptional_entry(mapping,
+								      index,
+								      folio);
+				continue;
+			}
+			/* so that the next lookup resumes after this folio */
+			index += folio_nr_pages(folio) - 1;
+
+			ret = mapping_evict_folio(mapping, folio);
+			folio_unlock(folio);
+			/*
+			 * Invalidation is a hint that the folio is no longer
+			 * of interest and try to speed up its reclaim.
+			 */
+			if (!ret) {
+				deactivate_file_folio(folio);
+				/* It is likely on the pagevec of a remote CPU */
+				if (nr_pagevec)
+					(*nr_pagevec)++;
+			}
+			count += ret;
+		}
+		folio_batch_remove_exceptionals(&fbatch);
+		folio_batch_release(&fbatch);
+		cond_resched();
+		index++;
+	}
+	return count;
+}
+
+/**
+ * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode
+ * @mapping: the address_space which holds the cache to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ *
+ * This function removes pages that are clean, unmapped and unlocked,
+ * as well as shadow entries. It will not block on IO activity.
+ *
+ * If you want to remove all the pages of one inode, regardless of
+ * their use and writeback state, use truncate_inode_pages().
+ *
+ * Return: the number of the cache entries that were invalidated
+ */
+unsigned long invalidate_mapping_pages(struct address_space *mapping,
+		pgoff_t start, pgoff_t end)
+{
+	return invalidate_mapping_pagevec(mapping, start, end, NULL);
+}
+EXPORT_SYMBOL(invalidate_mapping_pages);
+
+/*
+ * This is like invalidate_inode_page(), except it ignores the page's
+ * refcount.  We do this because invalidate_inode_pages2() needs stronger
+ * invalidation guarantees, and cannot afford to leave pages behind because
+ * shrink_page_list() has a temp ref on them, or because they're transiently
+ * sitting in the lru_cache_add() pagevecs.
+ */
+static int invalidate_complete_folio2(struct address_space *mapping,
+					struct folio *folio)
+{
+	if (folio->mapping != mapping)	/* folio is not attached to @mapping */
+		return 0;
+
+	if (!filemap_release_folio(folio, GFP_KERNEL))	/* release failed: give up */
+		return 0;
+
+	spin_lock(&mapping->host->i_lock);	/* outer lock: the host inode */
+	xa_lock_irq(&mapping->i_pages);		/* inner lock: the i_pages xarray */
+	if (folio_test_dirty(folio))	/* a dirty folio is left in the cache */
+		goto failed;
+
+	BUG_ON(folio_has_private(folio));
+	__filemap_remove_folio(folio, NULL);
+	xa_unlock_irq(&mapping->i_pages);
+	if (mapping_shrinkable(mapping))	/* checked with i_lock still held */
+		inode_add_lru(mapping->host);
+	spin_unlock(&mapping->host->i_lock);
+
+	filemap_free_folio(mapping, folio);	/* runs after both locks are dropped */
+	return 1;	/* folio was removed from @mapping */
+failed:
+	xa_unlock_irq(&mapping->i_pages);	/* unlock in reverse order */
+	spin_unlock(&mapping->host->i_lock);
+	return 0;	/* folio was not removed */
+}
+
+static int folio_launder(struct address_space *mapping, struct folio *folio)
+{
+	if (!folio_test_dirty(folio))	/* clean folio: nothing to launder */
+		return 0;
+	if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL)
+		return 0;	/* foreign folio, or no ->launder_folio() hook */
+	return mapping->a_ops->launder_folio(folio);	/* pass the hook's result up */
+}
+
+/**
+ * invalidate_inode_pages2_range - remove range of pages from an address_space
+ * @mapping: the address_space
+ * @start: the page offset 'from' which to invalidate
+ * @end: the page offset 'to' which to invalidate (inclusive)
+ *
+ * Any pages which are found to be mapped into pagetables are unmapped prior to
+ * invalidation.
+ *
+ * Return: 0 on success, else -EBUSY or the error from folio_launder().
+ */
+int invalidate_inode_pages2_range(struct address_space *mapping,
+				  pgoff_t start, pgoff_t end)
+{
+	pgoff_t indices[PAGEVEC_SIZE];
+	struct folio_batch fbatch;
+	pgoff_t index;
+	int i;
+	int ret = 0;	/* result reported to the caller */
+	int ret2 = 0;	/* result for the folio currently being processed */
+	int did_range_unmap = 0;	/* set once the bulk unmap has been done */
+
+	if (mapping_empty(mapping))	/* nothing cached: trivially successful */
+		return 0;
+
+	folio_batch_init(&fbatch);
+	index = start;
+	while (find_get_entries(mapping, index, end, &fbatch, indices)) {
+		for (i = 0; i < folio_batch_count(&fbatch); i++) {
+			struct folio *folio = fbatch.folios[i];
+
+			/* We rely upon deletion not changing folio->index */
+			index = indices[i];
+
+			if (xa_is_value(folio)) {	/* value entry, not a folio */
+				if (!invalidate_exceptional_entry2(mapping,
+								   index, folio))
+					ret = -EBUSY;
+				continue;
+			}
+
+			if (!did_range_unmap && folio_mapped(folio)) {
+				/*
+				 * If folio is mapped, before taking its lock,
+				 * zap the rest of the range in one hit.
+				 */
+				unmap_mapping_pages(mapping, index,
+						(1 + end - index), false);
+				did_range_unmap = 1;
+			}
+
+			folio_lock(folio);
+			VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+			if (folio->mapping != mapping) {	/* left @mapping: skip it */
+				folio_unlock(folio);
+				continue;
+			}
+			folio_wait_writeback(folio);
+
+			if (folio_mapped(folio))	/* still (or again) mapped */
+				unmap_mapping_folio(folio);
+			BUG_ON(folio_mapped(folio));
+
+			ret2 = folio_launder(mapping, folio);
+			if (ret2 == 0) {
+				if (!invalidate_complete_folio2(mapping, folio))
+					ret2 = -EBUSY;
+			}
+			if (ret2 < 0)	/* remember the most recent failure */
+				ret = ret2;
+			folio_unlock(folio);
+		}
+		folio_batch_remove_exceptionals(&fbatch);
+		folio_batch_release(&fbatch);
+		cond_resched();
+		index++;	/* resume the lookup after the last entry handled */
+	}
+	/*
+	 * For DAX we invalidate page tables after invalidating page cache. We
+	 * could invalidate page tables while invalidating each entry, however
+	 * that would be expensive. And doing range unmapping before doesn't
+	 * work as we have no cheap way to find out whether a page cache entry
+	 * got remapped later.
+	 */
+	if (dax_mapping(mapping)) {
+		unmap_mapping_pages(mapping, start, end - start + 1, false);
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
+
+/**
+ * invalidate_inode_pages2 - remove all pages from an address_space
+ * @mapping: the address_space
+ *
+ * Any pages which are found to be mapped into pagetables are unmapped prior to
+ * invalidation.
+ *
+ * Return: 0 on success, else the error from invalidate_inode_pages2_range().
+ */
+int invalidate_inode_pages2(struct address_space *mapping)
+{
+	return invalidate_inode_pages2_range(mapping, 0, -1);	/* start 0, end -1: everything */
+}
+EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
+
+/**
+ * truncate_pagecache - unmap and remove pagecache that has been truncated
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * inode's new i_size must already be written before truncate_pagecache
+ * is called.
+ *
+ * This function should typically be called before the filesystem
+ * releases resources associated with the freed range (e.g. deallocates
+ * blocks). This way, pagecache will always stay logically coherent
+ * with on-disk format, and the filesystem would not have to deal with
+ * situations such as writepage being called for a page that has already
+ * had its underlying blocks deallocated.
+ */
+void truncate_pagecache(struct inode *inode, loff_t newsize)
+{
+	struct address_space *mapping = inode->i_mapping;
+	loff_t holebegin = round_up(newsize, PAGE_SIZE);	/* newsize rounded up to a page */
+
+	/*
+	 * unmap_mapping_range is called twice, first simply for
+	 * efficiency so that truncate_inode_pages does fewer
+	 * single-page unmaps. However after this first call, and
+	 * before truncate_inode_pages finishes, it is possible for
+	 * private pages to be COWed, which remain after
+	 * truncate_inode_pages finishes, hence the second
+	 * unmap_mapping_range call must be made for correctness.
+	 */
+	unmap_mapping_range(mapping, holebegin, 0, 1);	/* first pass: efficiency */
+	truncate_inode_pages(mapping, newsize);
+	unmap_mapping_range(mapping, holebegin, 0, 1);	/* second pass: correctness */
+}
+EXPORT_SYMBOL(truncate_pagecache);
+
+/**
+ * truncate_setsize - update inode and pagecache for a new file size
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * truncate_setsize updates i_size and performs pagecache truncation (if
+ * necessary) to @newsize. It will typically be called from the filesystem's
+ * setattr function when ATTR_SIZE is passed in.
+ *
+ * Must be called with a lock serializing truncates and writes (generally
+ * i_rwsem but e.g. xfs uses a different lock) and before all filesystem
+ * specific block truncation has been performed.
+ */
+void truncate_setsize(struct inode *inode, loff_t newsize)
+{
+	loff_t oldsize = inode->i_size;	/* sampled before i_size is rewritten */
+
+	i_size_write(inode, newsize);
+	if (newsize > oldsize)	/* the file grew */
+		pagecache_isize_extended(inode, oldsize, newsize);
+	truncate_pagecache(inode, newsize);	/* called for growth and shrink alike */
+}
+EXPORT_SYMBOL(truncate_setsize);
+
+/**
+ * pagecache_isize_extended - update pagecache after extension of i_size
+ * @inode: inode for which i_size was extended
+ * @from: original inode size
+ * @to: new inode size
+ *
+ * Handle extension of inode size caused either by an extending truncate or by
+ * a write starting after current i_size. We mark the page straddling current
+ * i_size RO so that page_mkwrite() is called on the next write access to
+ * the page. This way the filesystem can be sure that page_mkwrite() is called
+ * on the page before user writes to the page via mmap after the i_size has
+ * been changed.
+ *
+ * The function must be called after i_size is updated so that page fault
+ * coming after we unlock the page will already see the new i_size.
+ * The function must be called while we still hold i_rwsem - this not only
+ * makes sure i_size is stable but also that userspace cannot observe new
+ * i_size value before we are prepared to store mmap writes at new inode size.
+ */
+void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
+{
+	int bsize = i_blocksize(inode);
+	loff_t rounded_from;
+	struct page *page;
+	pgoff_t index;
+
+	WARN_ON(to > inode->i_size);	/* i_size is expected to cover @to already */
+
+	if (from >= to || bsize == PAGE_SIZE)	/* no growth, or block size == page size */
+		return;
+	/* Page straddling @from will not have any hole block created? */
+	rounded_from = round_up(from, bsize);	/* end of the block containing @from */
+	if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1)))
+		return;	/* growth stays in that block, or the block ends page-aligned */
+
+	index = from >> PAGE_SHIFT;	/* index of the page containing @from */
+	page = find_lock_page(inode->i_mapping, index);
+	/* Page not cached? Nothing to do */
+	if (!page)
+		return;
+	/*
+	 * See clear_page_dirty_for_io() for details why set_page_dirty()
+	 * is needed.
+	 */
+	if (page_mkclean(page))
+		set_page_dirty(page);
+	unlock_page(page);	/* presumably pairs with find_lock_page()'s lock */
+	put_page(page);		/* presumably pairs with find_lock_page()'s reference */
+}
+EXPORT_SYMBOL(pagecache_isize_extended);
+
+/**
+ * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
+ * @inode: inode
+ * @lstart: offset of beginning of hole
+ * @lend: offset of last byte of hole (-1 means end of file)
+ *
+ * This function should typically be called before the filesystem
+ * releases resources associated with the freed range (e.g. deallocates
+ * blocks). This way, pagecache will always stay logically coherent
+ * with on-disk format, and the filesystem would not have to deal with
+ * situations such as writepage being called for a page that has already
+ * had its underlying blocks deallocated.
+ */
+void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+	struct address_space *mapping = inode->i_mapping;
+	loff_t unmap_start = round_up(lstart, PAGE_SIZE);	/* contract inwards: round up */
+	loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;	/* contract inwards: round down */
+	/*
+	 * This rounding is currently just for example: unmap_mapping_range
+	 * expands its hole outwards, whereas we want it to contract the hole
+	 * inwards. However, existing callers of truncate_pagecache_range are
+	 * doing their own page rounding first.
Note that unmap_mapping_range + * allows holelen 0 for all, and we allow lend -1 for end of file. + */ + + /* + * Unlike in truncate_pagecache, unmap_mapping_range is called only + * once (before truncating pagecache), and without "even_cows" flag: + * hole-punching should not remove private COWed pages from the hole. + */ + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + truncate_inode_pages_range(mapping, lstart, lend); +} +EXPORT_SYMBOL(truncate_pagecache_range); diff --git a/mm/usercopy.c b/mm/usercopy.c new file mode 100644 index 000000000..434fce112 --- /dev/null +++ b/mm/usercopy.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This implements the various checks for CONFIG_HARDENED_USERCOPY*, + * which are designed to protect kernel memory from needless exposure + * and overwrite under many unintended conditions. This code is based + * on PAX_USERCOPY, which is: + * + * Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source + * Security Inc. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "slab.h" + +/* + * Checks if a given pointer and length is contained by the current + * stack frame (if possible). + * + * Returns: + * NOT_STACK: not at all on the stack + * GOOD_FRAME: fully within a valid stack frame + * GOOD_STACK: within the current stack (when can't frame-check exactly) + * BAD_STACK: error condition (invalid stack position or bad stack frame) + */ +static noinline int check_stack_object(const void *obj, unsigned long len) +{ + const void * const stack = task_stack_page(current); + const void * const stackend = stack + THREAD_SIZE; + int ret; + + /* Object is not on the stack at all. 
*/
+	if (obj + len <= stack || stackend <= obj)
+		return NOT_STACK;
+
+	/*
+	 * Reject: object partially overlaps the stack (passing the
+	 * check above means at least one end is within the stack,
+	 * so if this check fails, the other end is outside the stack).
+	 */
+	if (obj < stack || stackend < obj + len)
+		return BAD_STACK;
+
+	/* Check if object is safely within a valid frame. */
+	ret = arch_within_stack_frames(stack, stackend, obj, len);
+	if (ret)	/* any non-zero verdict is returned unchanged */
+		return ret;
+
+	/* Finally, check stack depth if possible. */
+#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER
+	if (IS_ENABLED(CONFIG_STACK_GROWSUP)) {
+		if ((void *)current_stack_pointer < obj + len)	/* object ends above the SP */
+			return BAD_STACK;
+	} else {
+		if (obj < (void *)current_stack_pointer)	/* object starts below the SP */
+			return BAD_STACK;
+	}
+#endif
+
+	return GOOD_STACK;
+}
+
+/*
+ * If this function is reached, then CONFIG_HARDENED_USERCOPY has found
+ * an unexpected state during a copy_from_user() or copy_to_user() call.
+ * There are several checks being performed on the buffer by the
+ * __check_object_size() function. Normal stack buffer usage should never
+ * trip the checks, and kernel text addressing will always trip the check.
+ * For cache objects, it is checking that only the whitelisted range of
+ * bytes for a given cache is being accessed (via the cache's usersize and
+ * useroffset fields). To adjust a cache whitelist, use the usercopy-aware
+ * kmem_cache_create_usercopy() function to create the cache (and
+ * carefully audit the whitelist range).
+ */
+void __noreturn usercopy_abort(const char *name, const char *detail,
+			       bool to_user, unsigned long offset,
+			       unsigned long len)
+{
+	pr_emerg("Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
+		 to_user ? "exposure" : "overwrite",
+		 to_user ? "from" : "to",
+		 name ? : "unknown?!",	/* a NULL name is reported as "unknown?!" */
+		 detail ? " '" : "", detail ? : "", detail ? "'" : "",	/* optional, quoted */
+		 offset, len);
+
+	/*
+	 * For greater effect, it would be nice to do do_group_exit(),
+	 * but BUG() actually hooks all the lock-breaking and per-arch
+	 * Oops code, so that is used here instead.
+	 */
+	BUG();
+}
+
+/* Returns true if any portion of [ptr,ptr+n) overlaps with [low,high). */
+static bool overlaps(const unsigned long ptr, unsigned long n,
+		     unsigned long low, unsigned long high)
+{
+	const unsigned long check_low = ptr;
+	unsigned long check_high = check_low + n;	/* exclusive end; no wrap check here */
+
+	/* Does not overlap if entirely above or entirely below. */
+	if (check_low >= high || check_high <= low)
+		return false;
+
+	return true;
+}
+
+/* Is this address range in the kernel text area? */
+static inline void check_kernel_text_object(const unsigned long ptr,
+					    unsigned long n, bool to_user)
+{
+	unsigned long textlow = (unsigned long)_stext;
+	unsigned long texthigh = (unsigned long)_etext;
+	unsigned long textlow_linear, texthigh_linear;
+
+	if (overlaps(ptr, n, textlow, texthigh))	/* primary text mapping */
+		usercopy_abort("kernel text", NULL, to_user, ptr - textlow, n);
+
+	/*
+	 * Some architectures have virtual memory mappings with a secondary
+	 * mapping of the kernel text, i.e. there is more than one virtual
+	 * kernel address that points to the kernel image. This usually happens
+	 * when there is a separate linear physical memory mapping, in that
+	 * __pa() is not just the reverse of __va(). This can be detected
+	 * and checked:
+	 */
+	textlow_linear = (unsigned long)lm_alias(textlow);
+	/* No different mapping: we're done. */
+	if (textlow_linear == textlow)
+		return;
+
+	/* Check the secondary mapping... */
+	texthigh_linear = (unsigned long)lm_alias(texthigh);
+	if (overlaps(ptr, n, textlow_linear, texthigh_linear))
+		usercopy_abort("linear kernel text", NULL, to_user,
+			       ptr - textlow_linear, n);
+}
+
+static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
+				       bool to_user)
+{
+	/* Reject if object wraps past end of memory.
*/ + if (ptr + (n - 1) < ptr) + usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n); + + /* Reject if NULL or ZERO-allocation. */ + if (ZERO_OR_NULL_PTR(ptr)) + usercopy_abort("null address", NULL, to_user, ptr, n); +} + +static inline void check_heap_object(const void *ptr, unsigned long n, + bool to_user) +{ + unsigned long addr = (unsigned long)ptr; + unsigned long offset; + struct folio *folio; + + if (is_kmap_addr(ptr)) { + offset = offset_in_page(ptr); + if (n > PAGE_SIZE - offset) + usercopy_abort("kmap", NULL, to_user, offset, n); + return; + } + + if (is_vmalloc_addr(ptr) && !pagefault_disabled()) { + struct vmap_area *area = find_vmap_area(addr); + + if (!area) + usercopy_abort("vmalloc", "no area", to_user, 0, n); + + if (n > area->va_end - addr) { + offset = addr - area->va_start; + usercopy_abort("vmalloc", NULL, to_user, offset, n); + } + return; + } + + if (!virt_addr_valid(ptr)) + return; + + folio = virt_to_folio(ptr); + + if (folio_test_slab(folio)) { + /* Check slab allocator for flags and size. */ + __check_heap_object(ptr, n, folio_slab(folio), to_user); + } else if (folio_test_large(folio)) { + offset = ptr - folio_address(folio); + if (n > folio_size(folio) - offset) + usercopy_abort("page alloc", NULL, to_user, offset, n); + } +} + +static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks); + +/* + * Validates that the given object is: + * - not bogus address + * - fully contained by stack (or stack frame, when available) + * - fully within SLAB object (or object whitelist area, when available) + * - not in kernel text + */ +void __check_object_size(const void *ptr, unsigned long n, bool to_user) +{ + if (static_branch_unlikely(&bypass_usercopy_checks)) + return; + + /* Skip all tests if size is zero. */ + if (!n) + return; + + /* Check for invalid addresses. */ + check_bogus_address((const unsigned long)ptr, n, to_user); + + /* Check for bad stack object. 
*/ + switch (check_stack_object(ptr, n)) { + case NOT_STACK: + /* Object is not touching the current process stack. */ + break; + case GOOD_FRAME: + case GOOD_STACK: + /* + * Object is either in the correct frame (when it + * is possible to check) or just generally on the + * process stack (when frame checking not available). + */ + return; + default: + usercopy_abort("process stack", NULL, to_user, +#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER + IS_ENABLED(CONFIG_STACK_GROWSUP) ? + ptr - (void *)current_stack_pointer : + (void *)current_stack_pointer - ptr, +#else + 0, +#endif + n); + } + + /* Check for bad heap object. */ + check_heap_object(ptr, n, to_user); + + /* Check for object in kernel to avoid text exposure. */ + check_kernel_text_object((const unsigned long)ptr, n, to_user); +} +EXPORT_SYMBOL(__check_object_size); + +static bool enable_checks __initdata = true; + +static int __init parse_hardened_usercopy(char *str) +{ + if (strtobool(str, &enable_checks)) + pr_warn("Invalid option string for hardened_usercopy: '%s'\n", + str); + return 1; +} + +__setup("hardened_usercopy=", parse_hardened_usercopy); + +static int __init set_hardened_usercopy(void) +{ + if (enable_checks == false) + static_branch_enable(&bypass_usercopy_checks); + return 1; +} + +late_initcall(set_hardened_usercopy); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c new file mode 100644 index 000000000..650ab6cfd --- /dev/null +++ b/mm/userfaultfd.c @@ -0,0 +1,793 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mm/userfaultfd.c + * + * Copyright (C) 2015 Red Hat, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static __always_inline +struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len) +{ + /* + * Make sure that the dst range is both valid and fully within a + * single existing vma. 
+ */ + struct vm_area_struct *dst_vma; + + dst_vma = find_vma(dst_mm, dst_start); + if (!dst_vma) + return NULL; + + if (dst_start < dst_vma->vm_start || + dst_start + len > dst_vma->vm_end) + return NULL; + + /* + * Check the vma is registered in uffd, this is required to + * enforce the VM_MAYWRITE check done at uffd registration + * time. + */ + if (!dst_vma->vm_userfaultfd_ctx.ctx) + return NULL; + + return dst_vma; +} + +/* + * Install PTEs, to map dst_addr (within dst_vma) to page. + * + * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem + * and anon, and for both shared and private VMAs. + */ +int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, struct page *page, + bool newly_allocated, bool wp_copy) +{ + int ret; + pte_t _dst_pte, *dst_pte; + bool writable = dst_vma->vm_flags & VM_WRITE; + bool vm_shared = dst_vma->vm_flags & VM_SHARED; + bool page_in_cache = page_mapping(page); + spinlock_t *ptl; + struct inode *inode; + pgoff_t offset, max_off; + + _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + _dst_pte = pte_mkdirty(_dst_pte); + if (page_in_cache && !vm_shared) + writable = false; + + /* + * Always mark a PTE as write-protected when needed, regardless of + * VM_WRITE, which the user might change. + */ + if (wp_copy) { + _dst_pte = pte_mkuffd_wp(_dst_pte); + writable = false; + } + + if (writable) + _dst_pte = pte_mkwrite(_dst_pte); + else + /* + * We need this to make sure write bit removed; as mk_pte() + * could return a pte with write bit set. 
+ */ + _dst_pte = pte_wrprotect(_dst_pte); + + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + + if (vma_is_shmem(dst_vma)) { + /* serialize against truncate with the page table lock */ + inode = dst_vma->vm_file->f_inode; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + ret = -EFAULT; + if (unlikely(offset >= max_off)) + goto out_unlock; + } + + ret = -EEXIST; + /* + * We allow to overwrite a pte marker: consider when both MISSING|WP + * registered, we firstly wr-protect a none pte which has no page cache + * page backing it, then access the page. + */ + if (!pte_none_mostly(*dst_pte)) + goto out_unlock; + + if (page_in_cache) { + /* Usually, cache pages are already added to LRU */ + if (newly_allocated) + lru_cache_add(page); + page_add_file_rmap(page, dst_vma, false); + } else { + page_add_new_anon_rmap(page, dst_vma, dst_addr); + lru_cache_add_inactive_or_unevictable(page, dst_vma); + } + + /* + * Must happen after rmap, as mm_counter() checks mapping (via + * PageAnon()), which is set by __page_set_anon_rmap(). + */ + inc_mm_counter(dst_mm, mm_counter(page)); + + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + ret = 0; +out_unlock: + pte_unmap_unlock(dst_pte, ptl); + return ret; +} + +static int mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep, + bool wp_copy) +{ + void *page_kaddr; + int ret; + struct page *page; + + if (!*pagep) { + ret = -ENOMEM; + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr); + if (!page) + goto out; + + page_kaddr = kmap_local_page(page); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. 
For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); + ret = copy_from_user(page_kaddr, + (const void __user *) src_addr, + PAGE_SIZE); + pagefault_enable(); + kunmap_local(page_kaddr); + + /* fallback to copy_from_user outside mmap_lock */ + if (unlikely(ret)) { + ret = -ENOENT; + *pagep = page; + /* don't free the page */ + goto out; + } + + flush_dcache_page(page); + } else { + page = *pagep; + *pagep = NULL; + } + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. 
+	 */
+	__SetPageUptodate(page);
+
+	ret = -ENOMEM;	/* reported if the memcg charge below fails */
+	if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL))
+		goto out_release;
+
+	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+				       page, true, wp_copy);	/* true: newly_allocated */
+	if (ret)
+		goto out_release;
+out:
+	return ret;
+out_release:
+	put_page(page);	/* failure: drop the page before returning */
+	goto out;
+}
+
+static int mfill_zeropage_pte(struct mm_struct *dst_mm,
+			      pmd_t *dst_pmd,
+			      struct vm_area_struct *dst_vma,
+			      unsigned long dst_addr)
+{
+	pte_t _dst_pte, *dst_pte;
+	spinlock_t *ptl;
+	int ret;
+	pgoff_t offset, max_off;
+	struct inode *inode;
+
+	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+					 dst_vma->vm_page_prot));	/* special PTE for the zero pfn */
+	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+	if (dst_vma->vm_file) {
+		/* the shmem MAP_PRIVATE case requires checking the i_size */
+		inode = dst_vma->vm_file->f_inode;
+		offset = linear_page_index(dst_vma, dst_addr);
+		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+		ret = -EFAULT;	/* reported when the page offset lies beyond i_size */
+		if (unlikely(offset >= max_off))
+			goto out_unlock;
+	}
+	ret = -EEXIST;	/* reported when a PTE is already present at dst_addr */
+	if (!pte_none(*dst_pte))
+		goto out_unlock;
+	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(dst_vma, dst_addr, dst_pte);
+	ret = 0;
+out_unlock:
+	pte_unmap_unlock(dst_pte, ptl);	/* pairs with pte_offset_map_lock() above */
+	return ret;
+}
+
+/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private).
*/
+static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
+				pmd_t *dst_pmd,
+				struct vm_area_struct *dst_vma,
+				unsigned long dst_addr,
+				bool wp_copy)
+{
+	struct inode *inode = file_inode(dst_vma->vm_file);
+	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+	struct folio *folio;
+	struct page *page;
+	int ret;
+
+	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
+	/* Our caller expects us to return -EFAULT if we failed to find folio */
+	if (ret == -ENOENT)
+		ret = -EFAULT;
+	if (ret)	/* any other error is returned unchanged */
+		goto out;
+	if (!folio) {	/* lookup succeeded but produced no folio */
+		ret = -EFAULT;
+		goto out;
+	}
+
+	page = folio_file_page(folio, pgoff);
+	if (PageHWPoison(page)) {	/* refuse to map a hwpoisoned page */
+		ret = -EIO;
+		goto out_release;
+	}
+
+	ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+				       page, false, wp_copy);	/* false: not newly allocated */
+	if (ret)
+		goto out_release;
+
+	folio_unlock(folio);	/* success: unlock, no folio_put() on this path */
+	ret = 0;
+out:
+	return ret;
+out_release:
+	folio_unlock(folio);	/* failure: unlock and drop the reference */
+	folio_put(folio);
+	goto out;
+}
+
+static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+
+	pgd = pgd_offset(mm, address);
+	p4d = p4d_alloc(mm, pgd, address);
+	if (!p4d)	/* no p4d level available */
+		return NULL;
+	pud = pud_alloc(mm, p4d, address);
+	if (!pud)	/* no pud level available */
+		return NULL;
+	/*
+	 * Note that we didn't run this because the pmd was
+	 * missing, the *pmd may be already established and in
+	 * turn it may also be a trans_huge_pmd.
+	 */
+	return pmd_alloc(mm, pud, address);	/* may be NULL; __mcopy_atomic() checks */
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
+ * called with mmap_lock held; it will release mmap_lock before returning.
+ */ +static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, + struct vm_area_struct *dst_vma, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + enum mcopy_atomic_mode mode, + bool wp_copy) +{ + int vm_shared = dst_vma->vm_flags & VM_SHARED; + ssize_t err; + pte_t *dst_pte; + unsigned long src_addr, dst_addr; + long copied; + struct page *page; + unsigned long vma_hpagesize; + pgoff_t idx; + u32 hash; + struct address_space *mapping; + + /* + * There is no default zero huge page for all huge page sizes as + * supported by hugetlb. A PMD_SIZE huge pages may exist as used + * by THP. Since we can not reliably insert a zero page, this + * feature is not supported. + */ + if (mode == MCOPY_ATOMIC_ZEROPAGE) { + mmap_read_unlock(dst_mm); + return -EINVAL; + } + + src_addr = src_start; + dst_addr = dst_start; + copied = 0; + page = NULL; + vma_hpagesize = vma_kernel_pagesize(dst_vma); + + /* + * Validate alignment based on huge page size + */ + err = -EINVAL; + if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) + goto out_unlock; + +retry: + /* + * On routine entry dst_vma is set. If we had to drop mmap_lock and + * retry, dst_vma will be set to NULL and we must lookup again. + */ + if (!dst_vma) { + err = -ENOENT; + dst_vma = find_dst_vma(dst_mm, dst_start, len); + if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) + goto out_unlock; + + err = -EINVAL; + if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) + goto out_unlock; + + vm_shared = dst_vma->vm_flags & VM_SHARED; + } + + /* + * If not shared, ensure the dst_vma has a anon_vma. + */ + err = -ENOMEM; + if (!vm_shared) { + if (unlikely(anon_vma_prepare(dst_vma))) + goto out_unlock; + } + + while (src_addr < src_start + len) { + BUG_ON(dst_addr >= dst_start + len); + + /* + * Serialize via vma_lock and hugetlb_fault_mutex. + * vma_lock ensures the dst_pte remains valid even + * in the case of shared pmds. 
fault mutex prevents + * races with other faulting threads. + */ + idx = linear_page_index(dst_vma, dst_addr); + mapping = dst_vma->vm_file->f_mapping; + hash = hugetlb_fault_mutex_hash(mapping, idx); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(dst_vma); + + err = -ENOMEM; + dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); + if (!dst_pte) { + hugetlb_vma_unlock_read(dst_vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out_unlock; + } + + if (mode != MCOPY_ATOMIC_CONTINUE && + !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { + err = -EEXIST; + hugetlb_vma_unlock_read(dst_vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + goto out_unlock; + } + + err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, + dst_addr, src_addr, mode, &page, + wp_copy); + + hugetlb_vma_unlock_read(dst_vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + + cond_resched(); + + if (unlikely(err == -ENOENT)) { + mmap_read_unlock(dst_mm); + BUG_ON(!page); + + err = copy_huge_page_from_user(page, + (const void __user *)src_addr, + vma_hpagesize / PAGE_SIZE, + true); + if (unlikely(err)) { + err = -EFAULT; + goto out; + } + mmap_read_lock(dst_mm); + + dst_vma = NULL; + goto retry; + } else + BUG_ON(page); + + if (!err) { + dst_addr += vma_hpagesize; + src_addr += vma_hpagesize; + copied += vma_hpagesize; + + if (fatal_signal_pending(current)) + err = -EINTR; + } + if (err) + break; + } + +out_unlock: + mmap_read_unlock(dst_mm); +out: + if (page) + put_page(page); + BUG_ON(copied < 0); + BUG_ON(err > 0); + BUG_ON(!copied && !err); + return copied ? 
copied : err;
+}
+#else /* !CONFIG_HUGETLB_PAGE */
+/* fail at build time if gcc attempts to use this */
+extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+				      struct vm_area_struct *dst_vma,
+				      unsigned long dst_start,
+				      unsigned long src_start,
+				      unsigned long len,
+				      enum mcopy_atomic_mode mode,
+				      bool wp_copy);
+#endif /* CONFIG_HUGETLB_PAGE */
+
+/*
+ * Fill a single page at @dst_addr, picking the helper from @mode and from
+ * whether @dst_vma is a shared mapping.  MCOPY_ATOMIC_CONTINUE always goes to
+ * mcontinue_atomic_pte(); the result of the chosen helper is returned as is.
+ */
+static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
+						pmd_t *dst_pmd,
+						struct vm_area_struct *dst_vma,
+						unsigned long dst_addr,
+						unsigned long src_addr,
+						struct page **page,
+						enum mcopy_atomic_mode mode,
+						bool wp_copy)
+{
+	ssize_t err;
+
+	if (mode == MCOPY_ATOMIC_CONTINUE) {
+		return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+					    wp_copy);
+	}
+
+	/*
+	 * The normal page fault path for a shmem will invoke the
+	 * fault, fill the hole in the file and COW it right away. The
+	 * result generates plain anonymous memory. So when we are
+	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
+	 * generate anonymous memory directly without actually filling
+	 * the hole. For the MAP_PRIVATE case the robustness check
+	 * only happens in the pagetable (to verify it's still none)
+	 * and not in the radix tree.
+	 */
+	if (!(dst_vma->vm_flags & VM_SHARED)) {
+		if (mode == MCOPY_ATOMIC_NORMAL)
+			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
+					       dst_addr, src_addr, page,
+					       wp_copy);
+		else
+			err = mfill_zeropage_pte(dst_mm, dst_pmd,
+						 dst_vma, dst_addr);
+	} else {
+		/* Any mode other than NORMAL is passed on as "true" here. */
+		err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+					     dst_addr, src_addr,
+					     mode != MCOPY_ATOMIC_NORMAL,
+					     wp_copy, page);
+	}
+
+	return err;
+}
+
+/*
+ * Common body of mcopy_atomic(), mfill_zeropage() and mcopy_continue().
+ *
+ * Processes [dst_start, dst_start + len) one PAGE_SIZE step at a time with
+ * the mmap lock held for read.  hugetlb vmas are handed over wholesale to
+ * __mcopy_atomic_hugetlb().  When the per-page helper reports -ENOENT, the
+ * lock is dropped, the source page is copied from user space into the page
+ * handed back through the helper, and the whole lookup is redone from
+ * "retry".
+ *
+ * Returns the number of bytes processed if that is non-zero, otherwise a
+ * negative errno.
+ */
+static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
+					      unsigned long dst_start,
+					      unsigned long src_start,
+					      unsigned long len,
+					      enum mcopy_atomic_mode mcopy_mode,
+					      atomic_t *mmap_changing,
+					      __u64 mode)
+{
+	struct vm_area_struct *dst_vma;
+	ssize_t err;
+	pmd_t *dst_pmd;
+	unsigned long src_addr, dst_addr;
+	long copied;
+	struct page *page;
+	bool wp_copy;
+
+	/*
+	 * Sanitize the command parameters:
+	 */
+	BUG_ON(dst_start & ~PAGE_MASK);
+	BUG_ON(len & ~PAGE_MASK);
+
+	/* Does the address range wrap, or is the span zero-sized? */
+	BUG_ON(src_start + len <= src_start);
+	BUG_ON(dst_start + len <= dst_start);
+
+	src_addr = src_start;
+	dst_addr = dst_start;
+	copied = 0;
+	page = NULL;
+retry:
+	mmap_read_lock(dst_mm);
+
+	/*
+	 * If memory mappings are changing because of non-cooperative
+	 * operation (e.g. mremap) running in parallel, bail out and
+	 * request the user to retry later
+	 */
+	err = -EAGAIN;
+	if (mmap_changing && atomic_read(mmap_changing))
+		goto out_unlock;
+
+	/*
+	 * Make sure the vma is not shared, that the dst range is
+	 * both valid and fully within a single existing vma.
+	 */
+	err = -ENOENT;
+	dst_vma = find_dst_vma(dst_mm, dst_start, len);
+	if (!dst_vma)
+		goto out_unlock;
+
+	err = -EINVAL;
+	/*
+	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
+	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
+	 */
+	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
+	    dst_vma->vm_flags & VM_SHARED))
+		goto out_unlock;
+
+	/*
+	 * validate 'mode' now that we know the dst_vma: don't allow
+	 * a wrprotect copy if the userfaultfd didn't register as WP.
+	 */
+	wp_copy = mode & UFFDIO_COPY_MODE_WP;
+	if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
+		goto out_unlock;
+
+	/*
+	 * If this is a HUGETLB vma, pass off to appropriate routine
+	 *
+	 * NOTE(review): this path returns without unlocking here, so the
+	 * hugetlb routine is presumably responsible for dropping the mmap
+	 * lock taken above - confirm against its full definition.
+	 */
+	if (is_vm_hugetlb_page(dst_vma))
+		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+						src_start, len, mcopy_mode,
+						wp_copy);
+
+	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
+		goto out_unlock;
+	if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+		goto out_unlock;
+
+	/*
+	 * Ensure the dst_vma has an anon_vma or this page
+	 * would get a NULL anon_vma when moved in the
+	 * dst_vma.
+	 */
+	err = -ENOMEM;
+	if (!(dst_vma->vm_flags & VM_SHARED) &&
+	    unlikely(anon_vma_prepare(dst_vma)))
+		goto out_unlock;
+
+	while (src_addr < src_start + len) {
+		pmd_t dst_pmdval;
+
+		BUG_ON(dst_addr >= dst_start + len);
+
+		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
+		if (unlikely(!dst_pmd)) {
+			err = -ENOMEM;
+			break;
+		}
+
+		dst_pmdval = pmd_read_atomic(dst_pmd);
+		/*
+		 * If the dst_pmd is mapped as THP don't
+		 * override it and just be strict.
+		 */
+		if (unlikely(pmd_trans_huge(dst_pmdval))) {
+			err = -EEXIST;
+			break;
+		}
+		if (unlikely(pmd_none(dst_pmdval)) &&
+		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
+			err = -ENOMEM;
+			break;
+		}
+		/* If a huge pmd materialized from under us, fail */
+		if (unlikely(pmd_trans_huge(*dst_pmd))) {
+			err = -EFAULT;
+			break;
+		}
+
+		BUG_ON(pmd_none(*dst_pmd));
+		BUG_ON(pmd_trans_huge(*dst_pmd));
+
+		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+				       src_addr, &page, mcopy_mode, wp_copy);
+		cond_resched();
+
+		if (unlikely(err == -ENOENT)) {
+			void *page_kaddr;
+
+			/*
+			 * The user copy below is done with the mmap lock
+			 * released; "retry" takes it again and redoes the
+			 * vma lookup and checks.
+			 */
+			mmap_read_unlock(dst_mm);
+			BUG_ON(!page);
+
+			page_kaddr = kmap_local_page(page);
+			err = copy_from_user(page_kaddr,
+					     (const void __user *) src_addr,
+					     PAGE_SIZE);
+			kunmap_local(page_kaddr);
+			if (unlikely(err)) {
+				err = -EFAULT;
+				goto out;
+			}
+			flush_dcache_page(page);
+			goto retry;
+		} else
+			BUG_ON(page);
+
+		if (!err) {
+			dst_addr += PAGE_SIZE;
+			src_addr += PAGE_SIZE;
+			copied += PAGE_SIZE;
+
+			if (fatal_signal_pending(current))
+				err = -EINTR;
+		}
+		if (err)
+			break;
+	}
+
+out_unlock:
+	mmap_read_unlock(dst_mm);
+out:
+	if (page)
+		put_page(page);
+	BUG_ON(copied < 0);
+	BUG_ON(err > 0);
+	BUG_ON(!copied && !err);
+	/* Partial progress wins over the error that stopped the loop. */
+	return copied ? copied : err;
+}
+
+/* MCOPY_ATOMIC_NORMAL entry point: copy from @src_start to @dst_start. */
+ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+		     unsigned long src_start, unsigned long len,
+		     atomic_t *mmap_changing, __u64 mode)
+{
+	return __mcopy_atomic(dst_mm, dst_start, src_start, len,
+			      MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
+}
+
+/* MCOPY_ATOMIC_ZEROPAGE entry point: no source address, mode flags are 0. */
+ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
+		       unsigned long len, atomic_t *mmap_changing)
+{
+	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
+			      mmap_changing, 0);
+}
+
+/* MCOPY_ATOMIC_CONTINUE entry point: no source address, mode flags are 0. */
+ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
+		       unsigned long len, atomic_t *mmap_changing)
+{
+	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
+			      mmap_changing, 0);
+}
+
+/*
+ * Change the protection of [start, start + len) in @dst_vma.  With @enable_wp
+ * the page protection is derived from the vma flags minus VM_WRITE and
+ * MM_CP_UFFD_WP is requested; otherwise the plain vma protection is used with
+ * MM_CP_UFFD_WP_RESOLVE.  The change is bracketed by tlb_gather_mmu() and
+ * tlb_finish_mmu().
+ */
+void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
+		   unsigned long start, unsigned long len, bool enable_wp)
+{
+	struct mmu_gather tlb;
+	pgprot_t newprot;
+
+	if (enable_wp)
+		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
+	else
+		newprot = vm_get_page_prot(dst_vma->vm_flags);
+
+	tlb_gather_mmu(&tlb, dst_mm);
+	change_protection(&tlb, dst_vma, start, start + len, newprot,
+			  enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+	tlb_finish_mmu(&tlb);
+}
+
+/*
+ * Write-protect (or un-protect) [start, start + len) through uffd_wp_range(),
+ * with the vma located by find_dst_vma() under the mmap read lock.
+ *
+ * Returns 0 on success, -EAGAIN if @mmap_changing is set, -ENOENT if no
+ * suitable vma is found (missing, not userfaultfd_wp() or not
+ * vma_can_userfault()), and -EINVAL if a hugetlb range is not aligned to the
+ * vma's page size.
+ */
+int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
+			unsigned long len, bool enable_wp,
+			atomic_t *mmap_changing)
+{
+	struct vm_area_struct *dst_vma;
+	unsigned long page_mask;
+	int err;
+
+	/*
+	 * Sanitize the command parameters:
+	 */
+	BUG_ON(start & ~PAGE_MASK);
+	BUG_ON(len & ~PAGE_MASK);
+
+	/* Does the address range wrap, or is the span zero-sized? */
+	BUG_ON(start + len <= start);
+
+	mmap_read_lock(dst_mm);
+
+	/*
+	 * If memory mappings are changing because of non-cooperative
+	 * operation (e.g.
mremap) running in parallel, bail out and + * request the user to retry later + */ + err = -EAGAIN; + if (mmap_changing && atomic_read(mmap_changing)) + goto out_unlock; + + err = -ENOENT; + dst_vma = find_dst_vma(dst_mm, start, len); + + if (!dst_vma) + goto out_unlock; + if (!userfaultfd_wp(dst_vma)) + goto out_unlock; + if (!vma_can_userfault(dst_vma, dst_vma->vm_flags)) + goto out_unlock; + + if (is_vm_hugetlb_page(dst_vma)) { + err = -EINVAL; + page_mask = vma_kernel_pagesize(dst_vma) - 1; + if ((start & page_mask) || (len & page_mask)) + goto out_unlock; + } + + uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp); + + err = 0; +out_unlock: + mmap_read_unlock(dst_mm); + return err; +} diff --git a/mm/util.c b/mm/util.c new file mode 100644 index 000000000..ce3bb17c9 --- /dev/null +++ b/mm/util.c @@ -0,0 +1,1195 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" +#include "swap.h" + +/** + * kfree_const - conditionally free memory + * @x: pointer to the memory + * + * Function calls kfree only if @x is not in .rodata section. 
+ */
+void kfree_const(const void *x)
+{
+	/* Pointers into .rodata are left alone. */
+	if (is_kernel_rodata((unsigned long)x))
+		return;
+
+	kfree(x);
+}
+EXPORT_SYMBOL(kfree_const);
+
+/**
+ * kstrdup - duplicate a NUL-terminated string into newly allocated memory
+ * @s: the string to copy; may be %NULL
+ * @gfp: the GFP mask passed to the allocator
+ *
+ * Return: a new copy of @s, or %NULL if @s is %NULL or the allocation failed
+ */
+char *kstrdup(const char *s, gfp_t gfp)
+{
+	size_t nbytes;
+	char *copy;
+
+	if (s == NULL)
+		return NULL;
+
+	/* Length of the string including its terminating NUL. */
+	nbytes = strlen(s) + 1;
+
+	copy = kmalloc_track_caller(nbytes, gfp);
+	if (!copy)
+		return NULL;
+
+	memcpy(copy, s, nbytes);
+	return copy;
+}
+EXPORT_SYMBOL(kstrdup);
+
+/**
+ * kstrdup_const - duplicate a string unless it already lives in .rodata
+ * @s: the string to duplicate
+ * @gfp: the GFP mask passed to kstrdup() when a copy has to be made
+ *
+ * Note: the result must be released with kfree_const() and must not be
+ * passed to krealloc().
+ *
+ * Return: @s itself when it is in the .rodata section, otherwise whatever
+ * kstrdup() returns.
+ */
+const char *kstrdup_const(const char *s, gfp_t gfp)
+{
+	return is_kernel_rodata((unsigned long)s) ? s : kstrdup(s, gfp);
+}
+EXPORT_SYMBOL(kstrdup_const);
+
+/**
+ * kstrndup - allocate space for and copy an existing string
+ * @s: the string to duplicate
+ * @max: read at most @max chars from @s
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Note: Use kmemdup_nul() instead if the size is known exactly.
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
+ */
+char *kstrndup(const char *s, size_t max, gfp_t gfp)
+{
+	size_t nchars;
+	char *copy;
+
+	if (s == NULL)
+		return NULL;
+
+	nchars = strnlen(s, max);
+
+	/* One extra byte for the terminator added below. */
+	copy = kmalloc_track_caller(nchars + 1, gfp);
+	if (!copy)
+		return NULL;
+
+	memcpy(copy, s, nchars);
+	copy[nchars] = '\0';
+	return copy;
+}
+EXPORT_SYMBOL(kstrndup);
+
+/**
+ * kmemdup - copy a region of memory into a new allocation
+ *
+ * @src: start of the region to copy
+ * @len: number of bytes to copy
+ * @gfp: GFP mask passed to the allocator
+ *
+ * Return: the new copy of @src, or %NULL if the allocation failed
+ */
+void *kmemdup(const void *src, size_t len, gfp_t gfp)
+{
+	void *copy = kmalloc_track_caller(len, gfp);
+
+	if (!copy)
+		return NULL;
+
+	memcpy(copy, src, len);
+	return copy;
+}
+EXPORT_SYMBOL(kmemdup);
+
+/**
+ * kmemdup_nul - turn unterminated data into a NUL-terminated string
+ * @s: the data to copy; may be %NULL
+ * @len: number of bytes of @s to copy
+ * @gfp: GFP mask passed to the allocator
+ *
+ * Return: a new @len + 1 byte buffer holding @s followed by a NUL, or %NULL
+ * if @s is %NULL or the allocation failed
+ */
+char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
+{
+	char *copy;
+
+	if (s == NULL)
+		return NULL;
+
+	copy = kmalloc_track_caller(len + 1, gfp);
+	if (!copy)
+		return NULL;
+
+	memcpy(copy, s, len);
+	copy[len] = '\0';
+	return copy;
+}
+EXPORT_SYMBOL(kmemdup_nul);
+
+/**
+ * memdup_user - duplicate memory region from user space
+ *
+ * @src: source address in user space
+ * @len: number of bytes to copy
+ *
+ * Return: an ERR_PTR() on failure.  Result is physically
+ * contiguous, to be freed by kfree().
+ */
+void *memdup_user(const void __user *src, size_t len)
+{
+	void *p;
+
+	p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
+	if (!p)
+		return ERR_PTR(-ENOMEM);
+
+	/* Any non-zero result from copy_from_user() is reported as -EFAULT. */
+	if (copy_from_user(p, src, len)) {
+		kfree(p);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return p;
+}
+EXPORT_SYMBOL(memdup_user);
+
+/**
+ * vmemdup_user - duplicate memory region from user space
+ *
+ * @src: source address in user space
+ * @len: number of bytes to copy
+ *
+ * Return: an ERR_PTR() on failure.  Result may be not
+ * physically contiguous.  Use kvfree() to free.
+ */
+void *vmemdup_user(const void __user *src, size_t len)
+{
+	void *p;
+
+	p = kvmalloc(len, GFP_USER);
+	if (!p)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(p, src, len)) {
+		kvfree(p);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return p;
+}
+EXPORT_SYMBOL(vmemdup_user);
+
+/**
+ * strndup_user - duplicate an existing string from user space
+ * @s: The string to duplicate
+ * @n: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Return: newly allocated copy of @s or an ERR_PTR() in case of error:
+ * -EFAULT when strnlen_user() reports 0, -EINVAL when the reported length
+ * exceeds @n, or the error returned by memdup_user().
+ */
+char *strndup_user(const char __user *s, long n)
+{
+	char *p;
+	long length;
+
+	length = strnlen_user(s, n);
+
+	if (!length)
+		return ERR_PTR(-EFAULT);
+
+	if (length > n)
+		return ERR_PTR(-EINVAL);
+
+	p = memdup_user(s, length);
+
+	if (IS_ERR(p))
+		return p;
+
+	/* Force termination of the last copied byte. */
+	p[length - 1] = '\0';
+
+	return p;
+}
+EXPORT_SYMBOL(strndup_user);
+
+/**
+ * memdup_user_nul - duplicate memory region from user space and NUL-terminate
+ *
+ * @src: source address in user space
+ * @len: number of bytes to copy
+ *
+ * Return: an ERR_PTR() on failure.
+ */
+void *memdup_user_nul(const void __user *src, size_t len)
+{
+	char *p;
+
+	/*
+	 * Always use GFP_KERNEL, since copy_from_user() can sleep and
+	 * cause pagefault, which makes it pointless to use GFP_NOFS
+	 * or GFP_ATOMIC.
+	 */
+	p = kmalloc_track_caller(len + 1, GFP_KERNEL);
+	if (!p)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(p, src, len)) {
+		kfree(p);
+		return ERR_PTR(-EFAULT);
+	}
+	p[len] = '\0';
+
+	return p;
+}
+EXPORT_SYMBOL(memdup_user_nul);
+
+/*
+ * Check if the vma is being used as a stack by this task: true when
+ * KSTK_ESP() of current lies within [vm_start, vm_end] (both ends inclusive).
+ */
+int vma_is_stack_for_current(struct vm_area_struct *vma)
+{
+	struct task_struct * __maybe_unused t = current;
+
+	return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
+}
+
+/*
+ * Change backing file, only valid to use during initial VMA setup.
+ */
+void vma_set_file(struct vm_area_struct *vma, struct file *file)
+{
+	/* Changing an anonymous vma with this is illegal */
+	get_file(file);
+	/* After the swap, "file" names the previous vm_file, which is put. */
+	swap(vma->vm_file, file);
+	fput(file);
+}
+EXPORT_SYMBOL(vma_set_file);
+
+#ifndef STACK_RND_MASK
+#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
+#endif
+
+/*
+ * Page-align @stack_top and, if current has PF_RANDOMIZE set, shift it by a
+ * random number of pages bounded by STACK_RND_MASK: upwards with
+ * CONFIG_STACK_GROWSUP, downwards otherwise.
+ */
+unsigned long randomize_stack_top(unsigned long stack_top)
+{
+	unsigned long random_variable = 0;
+
+	if (current->flags & PF_RANDOMIZE) {
+		random_variable = get_random_long();
+		random_variable &= STACK_RND_MASK;
+		random_variable <<= PAGE_SHIFT;
+	}
+#ifdef CONFIG_STACK_GROWSUP
+	return PAGE_ALIGN(stack_top) + random_variable;
+#else
+	return PAGE_ALIGN(stack_top) - random_variable;
+#endif
+}
+
+/**
+ * randomize_page - Generate a random, page aligned address
+ * @start:	The smallest acceptable address the caller will take.
+ * @range:	The size of the area, starting at @start, within which the
+ *		random address must fall.
+ *
+ * If @start + @range would overflow, @range is capped.
+ *
+ * NOTE: Historical use of randomize_range, which this replaces, presumed that
+ * @start was already page aligned.  We now align it regardless.
+ *
+ * Return: A page aligned address within [start, start + range).  On error,
+ * @start is returned.
+ */
+unsigned long randomize_page(unsigned long start, unsigned long range)
+{
+	if (!PAGE_ALIGNED(start)) {
+		unsigned long aligned = PAGE_ALIGN(start);
+		unsigned long skipped = aligned - start;
+
+		/*
+		 * A window that ends before the first page boundary holds no
+		 * page aligned address.  Subtracting anyway would wrap @range
+		 * around to a huge value and produce an address far outside
+		 * the window, so leave the same way an empty range does.
+		 */
+		if (range < skipped)
+			return aligned;
+
+		range -= skipped;
+		start = aligned;
+	}
+
+	/* Cap the window so that start + range cannot overflow. */
+	if (start > ULONG_MAX - range)
+		range = ULONG_MAX - start;
+
+	/* From here on @range counts whole pages. */
+	range >>= PAGE_SHIFT;
+
+	if (range == 0)
+		return start;
+
+	return start + (get_random_long() % range << PAGE_SHIFT);
+}
+
+#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+/*
+ * Default brk randomization: a random page within 32M above mm->brk for
+ * 32-bit or compat tasks, within 1G otherwise.
+ */
+unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
+{
+	/* Is the current task 32bit ? */
+	if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
+		return randomize_page(mm->brk, SZ_32M);
+
+	return randomize_page(mm->brk, SZ_1G);
+}
+
+/*
+ * Random mmap offset in bytes: mmap_rnd_bits (or mmap_rnd_compat_bits for
+ * compat tasks, when configured) random bits, shifted up by PAGE_SHIFT.
+ */
+unsigned long arch_mmap_rnd(void)
+{
+	unsigned long rnd;
+
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+	if (is_compat_task())
+		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
+	else
+#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
+		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+
+	return rnd << PAGE_SHIFT;
+}
+
+/*
+ * Non-zero when the bottom-up layout should be used: ADDR_COMPAT_LAYOUT
+ * personality, an unlimited stack rlimit, or the legacy layout sysctl.
+ */
+static int mmap_is_legacy(struct rlimit *rlim_stack)
+{
+	if (current->personality & ADDR_COMPAT_LAYOUT)
+		return 1;
+
+	if (rlim_stack->rlim_cur == RLIM_INFINITY)
+		return 1;
+
+	return sysctl_legacy_va_layout;
+}
+
+/*
+ * Leave enough space between the mmap area and the stack to honour ulimit in
+ * the face of randomisation.
+ */
+#define MIN_GAP		(SZ_128M)
+#define MAX_GAP		(STACK_TOP / 6 * 5)
+
+/*
+ * Top-down mmap base: STACK_TOP minus the stack gap (rlimit plus guard gap
+ * and randomization padding, clamped to [MIN_GAP, MAX_GAP]) minus @rnd.
+ */
+static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
+{
+	unsigned long gap = rlim_stack->rlim_cur;
+	unsigned long pad = stack_guard_gap;
+
+	/* Account for stack randomization if necessary */
+	if (current->flags & PF_RANDOMIZE)
+		pad += (STACK_RND_MASK << PAGE_SHIFT);
+
+	/* Values close to RLIM_INFINITY can overflow. */
+	if (gap + pad > gap)
+		gap += pad;
+
+	if (gap < MIN_GAP)
+		gap = MIN_GAP;
+	else if (gap > MAX_GAP)
+		gap = MAX_GAP;
+
+	return PAGE_ALIGN(STACK_TOP - gap - rnd);
+}
+
+/*
+ * Choose the mm's mmap base and get_unmapped_area method: bottom-up from
+ * TASK_UNMAPPED_BASE for the legacy layout, top-down from mmap_base()
+ * otherwise.  The random offset is only applied with PF_RANDOMIZE.
+ */
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+{
+	unsigned long random_factor = 0UL;
+
+	if (current->flags & PF_RANDOMIZE)
+		random_factor = arch_mmap_rnd();
+
+	if (mmap_is_legacy(rlim_stack)) {
+		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
+		mm->get_unmapped_area = arch_get_unmapped_area;
+	} else {
+		mm->mmap_base = mmap_base(random_factor, rlim_stack);
+		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+	}
+}
+#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
+/* Fallback layout: bottom-up from TASK_UNMAPPED_BASE, no randomization. */
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+{
+	mm->mmap_base = TASK_UNMAPPED_BASE;
+	mm->get_unmapped_area = arch_get_unmapped_area;
+}
+#endif
+
+/**
+ * __account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
+ * @task:        task used to check RLIMIT_MEMLOCK
+ * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
+ *
+ * Assumes @task and @mm are valid (i.e. at least one reference on each), and
+ * that mmap_lock is held as writer.
+ *
+ * Return:
+ * * 0       on success
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+			struct task_struct *task, bool bypass_rlim)
+{
+	unsigned long cur, max_pages;
+	int rc = 0;
+
+	mmap_assert_write_locked(mm);
+
+	cur = mm->locked_vm;
+	if (!inc) {
+		/* Uncharging more than is charged is only warned about. */
+		WARN_ON_ONCE(pages > cur);
+		mm->locked_vm = cur - pages;
+	} else {
+		if (!bypass_rlim) {
+			/* The rlimit is in bytes, locked_vm counts pages. */
+			max_pages = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+			if (cur + pages > max_pages)
+				rc = -ENOMEM;
+		}
+		if (rc == 0)
+			mm->locked_vm = cur + pages;
+	}
+
+	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
+		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
+		 cur << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
+		 rc ? " - exceeded" : "");
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(__account_locked_vm);
+
+/**
+ * account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm:          mm to account against, may be NULL
+ * @pages:       number of pages to account
+ * @inc:         %true if @pages should be considered positive, %false if not
+ *
+ * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
+ *
+ * Return:
+ * * 0       on success, or if mm is NULL
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
+ */
+int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
+{
+	int rc;
+
+	/* Nothing to account, or nothing to account it against. */
+	if (!mm || pages == 0)
+		return 0;
+
+	mmap_write_lock(mm);
+	rc = __account_locked_vm(mm, pages, inc, current,
+				 capable(CAP_IPC_LOCK));
+	mmap_write_unlock(mm);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(account_locked_vm);
+
+/*
+ * Map @file into current's address space at page offset @pgoff.  The LSM hook
+ * is consulted first and its non-zero result is returned unchanged; -EINTR is
+ * returned if taking the mmap write lock is interrupted.  Otherwise the
+ * result of do_mmap() is returned, after populating the range if do_mmap()
+ * asked for it.
+ */
+unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
+	unsigned long len, unsigned long prot,
+	unsigned long flag, unsigned long pgoff)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long populate;
+	unsigned long mapped;
+	LIST_HEAD(uf);
+
+	mapped = security_mmap_file(file, prot, flag);
+	if (mapped)
+		return mapped;
+
+	if (mmap_write_lock_killable(mm))
+		return -EINTR;
+
+	mapped = do_mmap(file, addr, len, prot, flag, pgoff, &populate, &uf);
+	mmap_write_unlock(mm);
+
+	/* Both follow-ups run after the lock has been dropped. */
+	userfaultfd_unmap_complete(mm, &uf);
+	if (populate)
+		mm_populate(mapped, populate);
+
+	return mapped;
+}
+
+/*
+ * Byte-offset front end of vm_mmap_pgoff(): -EINVAL if @offset plus the page
+ * aligned @len wraps, or if @offset is not page aligned.
+ */
+unsigned long vm_mmap(struct file *file, unsigned long addr,
+	unsigned long len, unsigned long prot,
+	unsigned long flag, unsigned long offset)
+{
+	if (unlikely(offset + PAGE_ALIGN(len) < offset) ||
+	    unlikely(offset_in_page(offset)))
+		return -EINVAL;
+
+	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(vm_mmap);
+
+/**
+ * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
+ * @node: numa node to allocate from
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use kvfree for freeing the memory.
+ *
+ * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
+ * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
+ * preferable to the vmalloc fallback, due to visible performance drawbacks.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of failure
+ */
+void *kvmalloc_node(size_t size, gfp_t flags, int node)
+{
+	gfp_t kmalloc_flags = flags;
+	void *ret;
+
+	/*
+	 * We want to attempt a large physically contiguous block first because
+	 * it is less likely to fragment multiple larger blocks and therefore
+	 * contribute to a long term fragmentation less than vmalloc fallback.
+	 * However make sure that larger requests are not too disruptive - no
+	 * OOM killer and no allocation failure warnings as we have a fallback.
+	 */
+	if (size > PAGE_SIZE) {
+		kmalloc_flags |= __GFP_NOWARN;
+
+		if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
+			kmalloc_flags |= __GFP_NORETRY;
+
+		/* nofail semantic is implemented by the vmalloc fallback */
+		kmalloc_flags &= ~__GFP_NOFAIL;
+	}
+
+	ret = kmalloc_node(size, kmalloc_flags, node);
+
+	/*
+	 * It doesn't really make sense to fallback to vmalloc for sub page
+	 * requests
+	 */
+	if (ret || size <= PAGE_SIZE)
+		return ret;
+
+	/* non-sleeping allocations are not supported by vmalloc */
+	if (!gfpflags_allow_blocking(flags))
+		return NULL;
+
+	/* Don't even allow crazy sizes */
+	if (unlikely(size > INT_MAX)) {
+		/* Warn once unless the caller opted out with __GFP_NOWARN. */
+		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
+		return NULL;
+	}
+
+	/*
+	 * kvmalloc() can always use VM_ALLOW_HUGE_VMAP,
+	 * since the callers already cannot assume anything
+	 * about the resulting pointer, and cannot play
+	 * protection games.
+	 *
+	 * Note that the fallback gets the caller's original flags, not the
+	 * kmalloc_flags adjusted above.
+	 */
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+			flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+			node, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(kvmalloc_node);
+
+/**
+ * kvfree() - Free memory.
+ * @addr: Pointer to allocated memory.
+ *
+ * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
+ * It is slightly more efficient to use kfree() or vfree() if you are certain
+ * that you know which one to use.
+ *
+ * Context: Either preemptible task context or not-NMI interrupt.
+ */
+void kvfree(const void *addr)
+{
+	if (!is_vmalloc_addr(addr)) {
+		kfree(addr);
+		return;
+	}
+
+	vfree(addr);
+}
+EXPORT_SYMBOL(kvfree);
+
+/**
+ * kvfree_sensitive - wipe and free an object holding sensitive data
+ * @addr: address of the object to free
+ * @len: number of bytes to wipe before freeing
+ *
+ * The object is cleared with memzero_explicit(), which the compiler is not
+ * allowed to optimize away, and then released with kvfree().  Pointers for
+ * which ZERO_OR_NULL_PTR() is true are ignored.
+ */
+void kvfree_sensitive(const void *addr, size_t len)
+{
+	if (unlikely(ZERO_OR_NULL_PTR(addr)))
+		return;
+
+	memzero_explicit((void *)addr, len);
+	kvfree(addr);
+}
+EXPORT_SYMBOL(kvfree_sensitive);
+
+/*
+ * Grow a kvmalloc()ed buffer from @oldsize to @newsize bytes.  The buffer is
+ * returned untouched when it is already large enough; otherwise the first
+ * @oldsize bytes are copied into a new allocation and the old one is freed.
+ * On allocation failure %NULL is returned and @p is left allocated.
+ */
+void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+{
+	void *grown;
+
+	if (newsize <= oldsize)
+		return (void *)p;
+
+	grown = kvmalloc(newsize, flags);
+	if (grown == NULL)
+		return NULL;
+
+	memcpy(grown, p, oldsize);
+	kvfree(p);
+
+	return grown;
+}
+EXPORT_SYMBOL(kvrealloc);
+
+/**
+ * __vmalloc_array - allocate a virtually contiguous array
+ * @n: number of elements
+ * @size: size of one element
+ * @flags: GFP flags passed to __vmalloc()
+ *
+ * Return: the allocation, or %NULL if @n * @size overflows or __vmalloc()
+ * fails
+ */
+void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+	size_t total;
+
+	if (likely(!check_mul_overflow(n, size, &total)))
+		return __vmalloc(total, flags);
+
+	return NULL;
+}
+EXPORT_SYMBOL(__vmalloc_array);
+
+/**
+ * vmalloc_array - allocate a virtually contiguous array with GFP_KERNEL
+ * @n: number of elements
+ * @size: size of one element
+ */
+void *vmalloc_array(size_t n, size_t size)
+{
+	const gfp_t flags = GFP_KERNEL;
+
+	return __vmalloc_array(n, size, flags);
+}
+EXPORT_SYMBOL(vmalloc_array);
+
+/**
+ * __vcalloc - allocate and zero memory for a virtually contiguous array.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate (see kmalloc).
+ */
+void *__vcalloc(size_t n, size_t size, gfp_t flags)
+{
+	const gfp_t zeroing = flags | __GFP_ZERO;
+
+	return __vmalloc_array(n, size, zeroing);
+}
+EXPORT_SYMBOL(__vcalloc);
+
+/**
+ * vcalloc - allocate a zeroed, virtually contiguous array with GFP_KERNEL
+ * @n: number of elements
+ * @size: size of one element
+ */
+void *vcalloc(size_t n, size_t size)
+{
+	const gfp_t zeroing = GFP_KERNEL | __GFP_ZERO;
+
+	return __vmalloc_array(n, size, zeroing);
+}
+EXPORT_SYMBOL(vcalloc);
+
+/* Raw page->mapping value of the page's folio, whatever it points to */
+void *page_rmapping(struct page *page)
+{
+	struct folio *folio = page_folio(page);
+
+	return folio_raw_mapping(folio);
+}
+
+/**
+ * folio_mapped - check whether a folio is mapped into user page tables
+ * @folio: the folio to check
+ *
+ * Return: true if the folio as a whole, or any single page of it, has a
+ * non-negative mapcount.
+ */
+bool folio_mapped(struct folio *folio)
+{
+	long idx, nr_pages;
+
+	if (!folio_test_large(folio))
+		return atomic_read(&folio->_mapcount) >= 0;
+
+	/* Mapped as a whole? */
+	if (atomic_read(folio_mapcount_ptr(folio)) >= 0)
+		return true;
+
+	/* For hugetlb folios the per-page counts are not consulted. */
+	if (folio_test_hugetlb(folio))
+		return false;
+
+	nr_pages = folio_nr_pages(folio);
+	idx = 0;
+	while (idx < nr_pages) {
+		if (atomic_read(&folio_page(folio, idx)->_mapcount) >= 0)
+			return true;
+		idx++;
+	}
+
+	return false;
+}
+EXPORT_SYMBOL(folio_mapped);
+
+/*
+ * Return the anon_vma encoded in folio->mapping, or NULL when the mapping
+ * flag bits are anything other than exactly PAGE_MAPPING_ANON.
+ */
+struct anon_vma *folio_anon_vma(struct folio *folio)
+{
+	unsigned long raw = (unsigned long)folio->mapping;
+
+	if ((raw & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON)
+		return (void *)(raw - PAGE_MAPPING_ANON);
+
+	return NULL;
+}
+
+/**
+ * folio_mapping - Find the mapping where this folio is stored.
+ * @folio: The folio.
+ *
+ * For folios which are in the page cache, return the mapping that this
+ * page belongs to.  Folios in the swap cache return the swap mapping
+ * this page is stored in (which is different from the mapping for the
+ * swap file or swap device where the data is stored).
+ *
+ * You can call this for folios which aren't in the swap cache or page
+ * cache and it will return NULL.
+ */
+struct address_space *folio_mapping(struct folio *folio)
+{
+	struct address_space *mapping;
+
+	/* This happens if someone calls flush_dcache_page on slab page */
+	if (unlikely(folio_test_slab(folio)))
+		return NULL;
+
+	if (unlikely(folio_test_swapcache(folio)))
+		return swap_address_space(folio_swap_entry(folio));
+
+	mapping = folio->mapping;
+	/* Any flag bit set means this is not a plain address_space pointer. */
+	if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
+		return NULL;
+
+	return mapping;
+}
+EXPORT_SYMBOL(folio_mapping);
+
+/*
+ * Slow path of page_mapcount() for compound pages.
+ *
+ * Starts from the page's own _mapcount + 1.  For anonymous or hugetlb pages
+ * the compound mapcount of the head page (+ 1) is added as well, less one if
+ * the head page is PageDoubleMap().
+ */
+int __page_mapcount(struct page *page)
+{
+	int ret;
+
+	ret = atomic_read(&page->_mapcount) + 1;
+	/*
+	 * For file THP page->_mapcount contains total number of mapping
+	 * of the page: no need to look into compound_mapcount.
+	 */
+	if (!PageAnon(page) && !PageHuge(page))
+		return ret;
+	page = compound_head(page);
+	ret += atomic_read(compound_mapcount_ptr(page)) + 1;
+	if (PageDoubleMap(page))
+		ret--;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__page_mapcount);
+
+/**
+ * folio_mapcount() - Calculate the number of mappings of this folio.
+ * @folio: The folio.
+ *
+ * A large folio tracks both how many times the entire folio is mapped,
+ * and how many times each individual page in the folio is mapped.
+ * This function calculates the total number of times the folio is
+ * mapped.
+ *
+ * Return: The number of times this folio is mapped.
+ */
+int folio_mapcount(struct folio *folio)
+{
+	int i, compound, nr, ret;
+
+	if (likely(!folio_test_large(folio)))
+		return atomic_read(&folio->_mapcount) + 1;
+
+	compound = folio_entire_mapcount(folio);
+	/* hugetlb folios only report their entire-folio mapcount. */
+	if (folio_test_hugetlb(folio))
+		return compound;
+	ret = compound;
+	nr = folio_nr_pages(folio);
+	for (i = 0; i < nr; i++)
+		ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1;
+	/* File pages have compound_mapcount included in _mapcount */
+	if (!folio_test_anon(folio))
+		return ret - compound * nr;
+	if (folio_test_double_map(folio))
+		ret -= nr;
+	return ret;
+}
+
+/**
+ * folio_copy - Copy the contents of one folio to another.
+ * @dst: Folio to copy to.
+ * @src: Folio to copy from.
+ *
+ * The bytes in the folio represented by @src are copied to @dst.
+ * Assumes the caller has validated that @dst is at least as large as @src.
+ * Can be called in atomic context for order-0 folios, but if the folio is
+ * larger, it may sleep.
+ */
+void folio_copy(struct folio *dst, struct folio *src)
+{
+	long i = 0;
+	long nr = folio_nr_pages(src);
+
+	/*
+	 * The exit test sits before cond_resched(), so a single-page folio
+	 * never reaches the reschedule point.
+	 */
+	for (;;) {
+		copy_highpage(folio_page(dst, i), folio_page(src, i));
+		if (++i == nr)
+			break;
+		cond_resched();
+	}
+}
+
+int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
+int sysctl_overcommit_ratio __read_mostly = 50;
+unsigned long sysctl_overcommit_kbytes __read_mostly;
+int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
+
+/*
+ * sysctl handler for the overcommit ratio: a successful write clears
+ * sysctl_overcommit_kbytes, so only one of the two limits is in effect.
+ */
+int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret == 0 && write)
+		sysctl_overcommit_kbytes = 0;
+	return ret;
+}
+
+/* Work item run on every CPU to sync the vm_committed_as percpu counter. */
+static void sync_overcommit_as(struct work_struct *dummy)
+{
+	percpu_counter_sync(&vm_committed_as);
+}
+
+/*
+ * sysctl handler for the overcommit policy.  Writes are parsed into a local
+ * variable through a copy of @table so that the global policy is only
+ * updated as the last step; reads go straight through to
+ * proc_dointvec_minmax().
+ */
+int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	int new_policy = -1;
+	int ret;
+
+	/*
+	 * The deviation of sync_overcommit_as could be big with loose policy
+	 * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
+	 * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
+	 * with the strict "NEVER", and to avoid possible race condition (even
+	 * though user usually won't too frequently do the switching to policy
+	 * OVERCOMMIT_NEVER), the switch is done in the following order:
+	 *	1. changing the batch
+	 *	2. sync percpu count on each CPU
+	 *	3. switch the policy
+	 */
+	if (write) {
+		t = *table;
+		t.data = &new_policy;
+		ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+		/* new_policy still being -1 means nothing was parsed. */
+		if (ret || new_policy == -1)
+			return ret;
+
+		mm_compute_batch(new_policy);
+		if (new_policy == OVERCOMMIT_NEVER)
+			schedule_on_each_cpu(sync_overcommit_as);
+		sysctl_overcommit_memory = new_policy;
+	} else {
+		ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	}
+
+	return ret;
+}
+
+/*
+ * sysctl handler for the overcommit limit in kbytes: a successful write
+ * clears sysctl_overcommit_ratio, mirroring overcommit_ratio_handler().
+ */
+int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret == 0 && write)
+		sysctl_overcommit_ratio = 0;
+	return ret;
+}
+
+/*
+ * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
+ *
+ * In pages: sysctl_overcommit_kbytes converted to pages if it is set,
+ * otherwise sysctl_overcommit_ratio percent of non-hugetlb RAM; swap is
+ * added on top in both cases.
+ */
+unsigned long vm_commit_limit(void)
+{
+	unsigned long allowed;
+
+	if (sysctl_overcommit_kbytes)
+		allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
+	else
+		allowed = ((totalram_pages() - hugetlb_total_pages())
+			   * sysctl_overcommit_ratio / 100);
+	allowed += total_swap_pages;
+
+	return allowed;
+}
+
+/*
+ * Make sure vm_committed_as in one cacheline and not cacheline shared with
+ * other variables. It can be updated by several CPUs frequently.
+ */
+struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
+
+/*
+ * Report the system-wide committed memory as an exact, non-negative sum of
+ * the vm_committed_as percpu counter.
+ *
+ * The value can serve as an input for ballooning decisions when Linux runs
+ * as a guest.  On Hyper-V, the host implements a policy engine that balances
+ * memory dynamically across the virtual machines it hosts, and the memory
+ * commitment reported by the guest is one of the metrics driving it.
+ *
+ * The time cost of this is very low for small platforms, and for big
+ * platform like a 2S/36C/72T Skylake server, in worst case where
+ * vm_committed_as's spinlock is under severe contention, the time cost
+ * could be about 30~40 microseconds.
+ */
+unsigned long vm_memory_committed(void)
+{
+	unsigned long committed;
+
+	committed = percpu_counter_sum_positive(&vm_committed_as);
+
+	return committed;
+}
+EXPORT_SYMBOL_GPL(vm_memory_committed);
+
+/*
+ * Check that a process has enough memory to allocate a new virtual
+ * mapping. 0 means there is enough memory for the allocation to
+ * succeed and -ENOMEM implies there is not.
+ *
+ * We currently support three overcommit policies, which are set via the
+ * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
+ *
+ * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
+ * Additional code 2002 Jul 20 by Robert Love.
+ *
+ * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
+ *
+ * Note this is a helper function intended to be used by LSMs which
+ * wish to use this logic.
+ */ +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) +{ + long allowed; + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + return 0; + + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + if (pages > totalram_pages() + total_swap_pages) + goto error; + return 0; + } + + allowed = vm_commit_limit(); + /* + * Reserve some for root + */ + if (!cap_sys_admin) + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + + allowed -= min_t(long, mm->total_vm / 32, reserve); + } + + if (percpu_counter_read_positive(&vm_committed_as) < allowed) + return 0; +error: + pr_warn_ratelimited("%s: pid: %d, comm: %s, no enough memory for the allocation\n", + __func__, current->pid, current->comm); + vm_unacct_memory(pages); + + return -ENOMEM; +} + +/** + * get_cmdline() - copy the cmdline value to a buffer. + * @task: the task whose cmdline value to copy. + * @buffer: the buffer to copy to. + * @buflen: the length of the buffer. Larger cmdline values are truncated + * to this length. + * + * Return: the size of the cmdline field copied. Note that the copy does + * not guarantee an ending NULL byte. + */ +int get_cmdline(struct task_struct *task, char *buffer, int buflen) +{ + int res = 0; + unsigned int len; + struct mm_struct *mm = get_task_mm(task); + unsigned long arg_start, arg_end, env_start, env_end; + if (!mm) + goto out; + if (!mm->arg_end) + goto out_mm; /* Shh! 
No looking before we're done */ + + spin_lock(&mm->arg_lock); + arg_start = mm->arg_start; + arg_end = mm->arg_end; + env_start = mm->env_start; + env_end = mm->env_end; + spin_unlock(&mm->arg_lock); + + len = arg_end - arg_start; + + if (len > buflen) + len = buflen; + + res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE); + + /* + * If the nul at the end of args has been overwritten, then + * assume application is using setproctitle(3). + */ + if (res > 0 && buffer[res-1] != '\0' && len < buflen) { + len = strnlen(buffer, res); + if (len < res) { + res = len; + } else { + len = env_end - env_start; + if (len > buflen - res) + len = buflen - res; + res += access_process_vm(task, env_start, + buffer+res, len, + FOLL_FORCE); + res = strnlen(buffer, res); + } + } +out_mm: + mmput(mm); +out: + return res; +} + +int __weak memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = kmap_atomic(page1); + addr2 = kmap_atomic(page2); + ret = memcmp(addr1, addr2, PAGE_SIZE); + kunmap_atomic(addr2); + kunmap_atomic(addr1); + return ret; +} + +#ifdef CONFIG_PRINTK +/** + * mem_dump_obj - Print available provenance information + * @object: object for which to find provenance information. + * + * This function uses pr_cont(), so that the caller is expected to have + * printed out whatever preamble is appropriate. The provenance information + * depends on the type of object and on how much debugging is enabled. + * For example, for a slab-cache object, the slab name is printed, and, + * if available, the return address and stack trace from the allocation + * and last free path of that object. 
+ */
+void mem_dump_obj(void *object)
+{
+	/* Fallback label when none of the address checks below match. */
+	const char *desc = "non-paged memory";
+
+	/* Slab objects and vmalloc areas have dedicated dump helpers. */
+	if (kmem_valid_obj(object)) {
+		kmem_dump_obj(object);
+		return;
+	}
+	if (vmalloc_dump_obj(object))
+		return;
+
+	/* Otherwise classify the pointer; the checks run in this order. */
+	if (is_vmalloc_addr(object))
+		desc = "vmalloc memory";
+	else if (virt_addr_valid(object))
+		desc = "non-slab/vmalloc memory";
+	else if (!object)
+		desc = "NULL pointer";
+	else if (object == ZERO_SIZE_PTR)
+		desc = "zero-size pointer";
+
+	pr_cont(" %s\n", desc);
+}
+EXPORT_SYMBOL_GPL(mem_dump_obj);
+#endif
+
+/*
+ * A driver might set a page logically offline -- PageOffline() -- and
+ * turn the page inaccessible in the hypervisor; after that, access to page
+ * content can be fatal.
+ *
+ * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
+ * pages after checking PageOffline(); however, these PFN walkers can race
+ * with drivers that set PageOffline().
+ *
+ * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
+ * synchronize with such drivers, achieving that a page cannot be set
+ * PageOffline() while frozen.
+ *
+ * page_offline_begin()/page_offline_end() is used by drivers that care about
+ * such races when setting a page PageOffline().
+ */ +static DECLARE_RWSEM(page_offline_rwsem); + +void page_offline_freeze(void) +{ + down_read(&page_offline_rwsem); +} + +void page_offline_thaw(void) +{ + up_read(&page_offline_rwsem); +} + +void page_offline_begin(void) +{ + down_write(&page_offline_rwsem); +} +EXPORT_SYMBOL(page_offline_begin); + +void page_offline_end(void) +{ + up_write(&page_offline_rwsem); +} +EXPORT_SYMBOL(page_offline_end); + +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO +void flush_dcache_folio(struct folio *folio) +{ + long i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) + flush_dcache_page(folio_page(folio, i)); +} +EXPORT_SYMBOL(flush_dcache_folio); +#endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c new file mode 100644 index 000000000..67a10a04d --- /dev/null +++ b/mm/vmalloc.c @@ -0,0 +1,4218 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1993 Linus Torvalds + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian , May 2000 + * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 + * Numa awareness, Christoph Lameter, SGI, June 2005 + * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" +#include "pgalloc-track.h" + +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1; + +static int __init set_nohugeiomap(char *str) +{ + ioremap_max_page_shift = PAGE_SHIFT; + return 0; +} +early_param("nohugeiomap", set_nohugeiomap); +#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ +static const unsigned int ioremap_max_page_shift = PAGE_SHIFT; +#endif /* 
CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
+static bool __ro_after_init vmap_allow_huge = true;
+
+static int __init set_nohugevmalloc(char *str)
+{
+	vmap_allow_huge = false;	/* "nohugevmalloc" early parameter seen */
+	return 0;
+}
+early_param("nohugevmalloc", set_nohugevmalloc);
+#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+static const bool vmap_allow_huge = false;
+#endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
+/*
+ * is_vmalloc_addr - test whether @x lies in [VMALLOC_START, VMALLOC_END).
+ * The pointer is passed through kasan_reset_tag() before the comparison.
+ */
+bool is_vmalloc_addr(const void *x)
+{
+	unsigned long addr = (unsigned long)kasan_reset_tag(x);
+
+	return addr >= VMALLOC_START && addr < VMALLOC_END;
+}
+EXPORT_SYMBOL(is_vmalloc_addr);
+/*
+ * Per-CPU deferred-free state: a lock-less list of pending entries and the
+ * work item (handled by free_work()) that drains it.
+ */
+struct vfree_deferred {
+	struct llist_head list;
+	struct work_struct wq;
+};
+static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
+
+static void __vunmap(const void *, int);
+/*
+ * Work handler: detach everything queued on this vfree_deferred's list in
+ * one go and pass each node to __vunmap() with a second argument of 1.
+ */
+static void free_work(struct work_struct *w)
+{
+	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
+	struct llist_node *t, *llnode;
+
+	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
+		__vunmap((void *)llnode, 1);	/* the list node itself is the address handed over */
+}
+
+/*** Page table manipulation functions ***/
+static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+	pte_t *pte;
+	u64 pfn;
+	unsigned long size = PAGE_SIZE;	/* step per iteration; may grow under CONFIG_HUGETLB_PAGE */
+
+	pfn = phys_addr >> PAGE_SHIFT;
+	pte = pte_alloc_kernel_track(pmd, addr, mask);
+	if (!pte)
+		return -ENOMEM;
+	do {
+		BUG_ON(!pte_none(*pte));	/* the slot must still be empty */
+
+#ifdef CONFIG_HUGETLB_PAGE
+		size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
+		if (size != PAGE_SIZE) {
+			pte_t entry = pfn_pte(pfn, prot);
+
+			entry = arch_make_huge_pte(entry, ilog2(size), 0);
+			set_huge_pte_at(&init_mm, addr, pte, entry);
+			pfn += PFN_DOWN(size);
+			continue;	/* the while() expression below still advances pte and addr by size */
+		}
+#endif
+		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+		pfn++;
+	} while (pte += PFN_DOWN(size), addr += size, addr != end);
+	*mask |= PGTBL_PTE_MODIFIED;
+	return 0;
+}
+/*
+ * Try to cover [addr, end) with a single PMD-level huge entry.
+ *
+ * Returns 0 without touching the entry when @max_page_shift is below
+ * PMD_SHIFT, the architecture does not support it for @prot, the range is
+ * not exactly PMD_SIZE, @addr or @phys_addr is not PMD_SIZE aligned, or a
+ * present entry could not be released by pmd_free_pte_page(). Otherwise
+ * returns the result of pmd_set_huge().
+ */
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (max_page_shift < PMD_SHIFT)
+		return 0;
+
+	if (!arch_vmap_pmd_supported(prot))
+		return 0;
+
+	if ((end - addr) != PMD_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(addr, PMD_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+		return 0;
+
+	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+		return 0;
+
+	return pmd_set_huge(pmd, phys_addr, prot);
+}
+/*
+ * Map [addr, end) below @pud. Each PMD-sized step first tries a huge entry
+ * and otherwise falls back to vmap_pte_range(). Any failure from the PTE
+ * level, as well as a failed PMD allocation, is reported as -ENOMEM.
+ */
+static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+	if (!pmd)
+		return -ENOMEM;
+	do {
+		next = pmd_addr_end(addr, end);
+
+		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
+					max_page_shift)) {
+			*mask |= PGTBL_PMD_MODIFIED;
+			continue;
+		}
+
+		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
+			return -ENOMEM;
+	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+/*
+ * PUD-level counterpart of vmap_try_huge_pmd(): the same sequence of checks
+ * using PUD_SHIFT/PUD_SIZE, pud_free_pmd_page() and pud_set_huge().
+ */
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (max_page_shift < PUD_SHIFT)
+		return 0;
+
+	if (!arch_vmap_pud_supported(prot))
+		return 0;
+
+	if ((end - addr) != PUD_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(addr, PUD_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+		return 0;
+
+	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+		return 0;
+
+	return pud_set_huge(pud, phys_addr, prot);
+}
+/*
+ * Map [addr, end) below @p4d: huge PUD entry where possible, otherwise
+ * descend into vmap_pmd_range(). Failures are reported as -ENOMEM.
+ */
+static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+	if (!pud)
+		return -ENOMEM;
+	do {
+		next = pud_addr_end(addr, end);
+
+		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
+					max_page_shift)) {
+			*mask |= PGTBL_PUD_MODIFIED;
+			continue;
+		}
+
+		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
+					max_page_shift, mask))
+			return -ENOMEM;
+	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+/*
+ * P4D-level counterpart of vmap_try_huge_pmd(): the same sequence of checks
+ * using P4D_SHIFT/P4D_SIZE, p4d_free_pud_page() and p4d_set_huge().
+ */
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (max_page_shift < P4D_SHIFT)
+		return 0;
+
+	if (!arch_vmap_p4d_supported(prot))
+		return 0;
+
+	if ((end - addr) != P4D_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(addr, P4D_SIZE))
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+		return 0;
+
+	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+		return 0;
+
+	return p4d_set_huge(p4d, phys_addr, prot);
+}
+/*
+ * Map [addr, end) below @pgd: huge P4D entry where possible, otherwise
+ * descend into vmap_pud_range(). Failures are reported as -ENOMEM.
+ */
+static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift, pgtbl_mod_mask *mask)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+	if (!p4d)
+		return -ENOMEM;
+	do {
+		next = p4d_addr_end(addr, end);
+
+		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
+					max_page_shift)) {
+			*mask |= PGTBL_P4D_MODIFIED;
+			continue;
+		}
+
+		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
+					max_page_shift, mask))
+			return -ENOMEM;
+	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+/*
+ * Map the physical range starting at @phys_addr to kernel virtual addresses
+ * [addr, end) using mappings of at most @max_page_shift, without flushing
+ * caches. The walk stops at the first error, but
+ * arch_sync_kernel_mappings() is still called for the whole range when the
+ * accumulated modification mask requires it. Returns 0 or the error from
+ * the lower levels.
+ */
+static int vmap_range_noflush(unsigned long addr, unsigned long end,
+			phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	pgd_t *pgd;
+	unsigned long start;
+	unsigned long next;
+	int err;
+	pgtbl_mod_mask mask = 0;
+
+	might_sleep();
+	BUG_ON(addr >= end);
+
+	start = addr;	/* addr is advanced by the loop; keep the origin for the sync call */
+	pgd = pgd_offset_k(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
+					max_page_shift, &mask);
+		if (err)
+			break;
+	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+		arch_sync_kernel_mappings(start, end);
+
+	return err;
+}
+/*
+ * Map [addr, end) to @phys_addr with pgprot_nx(@prot), limited to
+ * ioremap_max_page_shift. flush_cache_vmap() is called whether or not the
+ * mapping succeeded; kmsan_ioremap_page_range() only runs on success and
+ * its result then becomes the return value.
+ */
+int ioremap_page_range(unsigned long addr, unsigned long end,
+		phys_addr_t phys_addr, pgprot_t prot)
+{
+	int err;
+
+	err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
+				 ioremap_max_page_shift);
+	flush_cache_vmap(addr, end);
+	if (!err)
+		err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
+					       ioremap_max_page_shift);
+	return err;
+}
+/*
+ * Clear every PTE in [addr, end) below @pmd, warning about entries that
+ * are neither empty nor present.
+ */
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			     pgtbl_mod_mask *mask)
+{
+	pte_t *pte;
+
+	pte = pte_offset_kernel(pmd, addr);
+	do {
+		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
+		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	*mask |= PGTBL_PTE_MODIFIED;
+}
+/*
+ * Unmap [addr, end) below @pud. A huge PMD entry is cleared directly;
+ * otherwise the PTE level is walked. Calls cond_resched() after each PTE
+ * table has been processed.
+ */
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+			     pgtbl_mod_mask *mask)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	int cleared;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+
+		cleared = pmd_clear_huge(pmd);
+		if (cleared || pmd_bad(*pmd))
+			*mask |= PGTBL_PMD_MODIFIED;
+
+		if (cleared)
+			continue;	/* huge entry removed, nothing below it to walk */
+		if (pmd_none_or_clear_bad(pmd))
+			continue;
+		vunmap_pte_range(pmd, addr, next, mask);
+
+		cond_resched();
+	} while (pmd++, addr = next, addr != end);
+}
+/*
+ * Unmap [addr, end) below @p4d; same scheme as vunmap_pmd_range() one
+ * level up, without the reschedule point.
+ */
+static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+			     pgtbl_mod_mask *mask)
+{
+	pud_t *pud;
+	unsigned long next;
+	int cleared;
+
+	pud = pud_offset(p4d, addr);
+	do {
+		next = pud_addr_end(addr, end);
+
+		cleared = pud_clear_huge(pud);
+		if (cleared || pud_bad(*pud))
+			*mask |= PGTBL_PUD_MODIFIED;
+
+		if (cleared)
+			continue;
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		vunmap_pmd_range(pud, addr, next, mask);
+	} while (pud++, addr = next, addr != end);
+}
+/*
+ * Unmap [addr, end) below @pgd. Unlike the PMD/PUD levels, the result of
+ * p4d_clear_huge() is not consulted here; the entry is re-checked with
+ * p4d_none_or_clear_bad() before descending.
+ */
+static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+			     pgtbl_mod_mask *mask)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+
+		p4d_clear_huge(p4d);
+		if (p4d_bad(*p4d))
+			*mask |= PGTBL_P4D_MODIFIED;
+
+		if (p4d_none_or_clear_bad(p4d))
+			continue;
+		vunmap_pud_range(p4d, addr, next, mask);
+	} while (p4d++, addr = next, addr != end);
+}
+
+/*
+ * vunmap_range_noflush is similar to vunmap_range, but does not
+ * flush caches or TLBs.
+ *
+ * The caller is responsible for calling flush_cache_vunmap() before calling
+ * this function (as vunmap_range() does), and flush_tlb_kernel_range after
+ * it has returned successfully (and before the addresses are expected to
+ * cause a page fault or be re-mapped for something else, if TLB flushes are
+ * being delayed or coalesced).
+ *
+ * This is an internal function only. Do not use outside mm/.
+ */
+void __vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+	unsigned long next;
+	pgd_t *pgd;
+	unsigned long addr = start;
+	pgtbl_mod_mask mask = 0;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset_k(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (pgd_bad(*pgd))
+			mask |= PGTBL_PGD_MODIFIED;
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		vunmap_p4d_range(pgd, addr, next, &mask);
+	} while (pgd++, addr = next, addr != end);
+
+	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+		arch_sync_kernel_mappings(start, end);
+}
+/* Same as __vunmap_range_noflush(), preceded by the KMSAN unmap hook. */
+void vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+	kmsan_vunmap_range_noflush(start, end);
+	__vunmap_range_noflush(start, end);
+}
+
+/**
+ * vunmap_range - unmap kernel virtual addresses
+ * @addr: start of the VM area to unmap
+ * @end: end of the VM area to unmap (non-inclusive)
+ *
+ * Clears any present PTEs in the virtual address range, flushes TLBs and
+ * caches. Any subsequent access to the address before it has been re-mapped
+ * is a kernel bug.
+ */ +void vunmap_range(unsigned long addr, unsigned long end) +{ + flush_cache_vunmap(addr, end); + vunmap_range_noflush(addr, end); + flush_tlb_kernel_range(addr, end); +} + +static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) +{ + pte_t *pte; + + /* + * nr is a running index into the array which helps higher level + * callers keep track of where we're up to. + */ + + pte = pte_alloc_kernel_track(pmd, addr, mask); + if (!pte) + return -ENOMEM; + do { + struct page *page = pages[*nr]; + + if (WARN_ON(!pte_none(*pte))) + return -EBUSY; + if (WARN_ON(!page)) + return -ENOMEM; + if (WARN_ON(!pfn_valid(page_to_pfn(page)))) + return -EINVAL; + + set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); + (*nr)++; + } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; + return 0; +} + +static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask)) + return -ENOMEM; + } while (pmd++, addr = next, addr != end); + return 0; +} + +static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) +{ + pud_t *pud; + unsigned long next; + + pud = pud_alloc_track(&init_mm, p4d, addr, mask); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask)) + return -ENOMEM; + } while (pud++, addr = next, addr != end); + return 0; +} + +static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask 
*mask) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); + if (!p4d) + return -ENOMEM; + do { + next = p4d_addr_end(addr, end); + if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask)) + return -ENOMEM; + } while (p4d++, addr = next, addr != end); + return 0; +} + +static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) +{ + unsigned long start = addr; + pgd_t *pgd; + unsigned long next; + int err = 0; + int nr = 0; + pgtbl_mod_mask mask = 0; + + BUG_ON(addr >= end); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; + err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); + if (err) + return err; + } while (pgd++, addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + + return 0; +} + +/* + * vmap_pages_range_noflush is similar to vmap_pages_range, but does not + * flush caches. + * + * The caller is responsible for calling flush_cache_vmap() after this + * function returns successfully and before the addresses are accessed. + * + * This is an internal function only. Do not use outside mm/. 
+ */ +int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + unsigned int i, nr = (end - addr) >> PAGE_SHIFT; + + WARN_ON(page_shift < PAGE_SHIFT); + + if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || + page_shift == PAGE_SHIFT) + return vmap_small_pages_range_noflush(addr, end, prot, pages); + + for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { + int err; + + err = vmap_range_noflush(addr, addr + (1UL << page_shift), + page_to_phys(pages[i]), prot, + page_shift); + if (err) + return err; + + addr += 1UL << page_shift; + } + + return 0; +} + +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages, + page_shift); + + if (ret) + return ret; + return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); +} + +/** + * vmap_pages_range - map pages to a kernel virtual address + * @addr: start of the VM area to map + * @end: end of the VM area to map (non-inclusive) + * @prot: page protection flags to use + * @pages: pages to map (always PAGE_SIZE pages) + * @page_shift: maximum shift that the pages may be mapped with, @pages must + * be aligned and contiguous up to at least this shift. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + int err; + + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + return err; +} + +int is_vmalloc_or_module_addr(const void *x) +{ + /* + * ARM, x86-64 and sparc64 put modules in a special place, + * and fall back on vmalloc() if that fails. Others + * just put it in the vmalloc space. 
+ */ +#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) + unsigned long addr = (unsigned long)kasan_reset_tag(x); + if (addr >= MODULES_VADDR && addr < MODULES_END) + return 1; +#endif + return is_vmalloc_addr(x); +} + +/* + * Walk a vmap address to the struct page it maps. Huge vmap mappings will + * return the tail page that corresponds to the base page address, which + * matches small vmap mappings. + */ +struct page *vmalloc_to_page(const void *vmalloc_addr) +{ + unsigned long addr = (unsigned long) vmalloc_addr; + struct page *page = NULL; + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + + /* + * XXX we might need to change this if we add VIRTUAL_BUG_ON for + * architectures that do not vmalloc module space + */ + VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); + + if (pgd_none(*pgd)) + return NULL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return NULL; /* XXX: no allowance for huge pgd */ + if (WARN_ON_ONCE(pgd_bad(*pgd))) + return NULL; + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return NULL; + if (p4d_leaf(*p4d)) + return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(p4d_bad(*p4d))) + return NULL; + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return NULL; + if (pud_leaf(*pud)) + return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pud_bad(*pud))) + return NULL; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return NULL; + if (pmd_leaf(*pmd)) + return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pmd_bad(*pmd))) + return NULL; + + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); + + return page; +} +EXPORT_SYMBOL(vmalloc_to_page); + +/* + * Map a vmalloc()-space virtual address to the physical page frame number. 
+ */
+unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
+{
+	return page_to_pfn(vmalloc_to_page(vmalloc_addr));	/* lookup result is used without a NULL check */
+}
+EXPORT_SYMBOL(vmalloc_to_pfn);
+
+
+/*** Global kva allocator ***/
+
+#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
+#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
+
+
+static DEFINE_SPINLOCK(vmap_area_lock);
+static DEFINE_SPINLOCK(free_vmap_area_lock);
+/* Export for kexec only */
+LIST_HEAD(vmap_area_list);
+static struct rb_root vmap_area_root = RB_ROOT;
+static bool vmap_initialized __read_mostly;
+
+static struct rb_root purge_vmap_area_root = RB_ROOT;
+static LIST_HEAD(purge_vmap_area_list);
+static DEFINE_SPINLOCK(purge_vmap_area_lock);
+
+/*
+ * This kmem_cache is used for vmap_area objects. Instead of
+ * allocating from slab we reuse an object from this cache to
+ * make things faster, especially in the "no edge" splitting of
+ * a free block.
+ */
+static struct kmem_cache *vmap_area_cachep;
+
+/*
+ * This linked list is used together with free_vmap_area_root.
+ * It gives O(1) access to prev/next to perform fast coalescing.
+ */
+static LIST_HEAD(free_vmap_area_list);
+
+/*
+ * This augmented red-black tree represents the free vmap space.
+ * All vmap_area objects in this tree are sorted by va->va_start
+ * address. It is used for allocation and merging when a vmap
+ * object is released.
+ *
+ * Each vmap_area node contains the maximum available free block
+ * of its sub-tree, right or left. Therefore it is possible to
+ * find the lowest match of a free area.
+ */
+static struct rb_root free_vmap_area_root = RB_ROOT;
+
+/*
+ * Preload a CPU with one object for "no edge" split case. The
+ * aim is to get rid of allocations from the atomic context, thus
+ * to use more permissive allocation masks.
+ */ +static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); + +static __always_inline unsigned long +va_size(struct vmap_area *va) +{ + return (va->va_end - va->va_start); +} + +static __always_inline unsigned long +get_subtree_max_size(struct rb_node *node) +{ + struct vmap_area *va; + + va = rb_entry_safe(node, struct vmap_area, rb_node); + return va ? va->subtree_max_size : 0; +} + +RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, + struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) + +static void purge_vmap_area_lazy(void); +static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); +static void drain_vmap_area_work(struct work_struct *work); +static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); + +static atomic_long_t nr_vmalloc_pages; + +unsigned long vmalloc_nr_pages(void) +{ + return atomic_long_read(&nr_vmalloc_pages); +} + +/* Look up the first VA which satisfies addr < va_end, NULL if none. */ +static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) +{ + struct vmap_area *va = NULL; + struct rb_node *n = vmap_area_root.rb_node; + + addr = (unsigned long)kasan_reset_tag((void *)addr); + + while (n) { + struct vmap_area *tmp; + + tmp = rb_entry(n, struct vmap_area, rb_node); + if (tmp->va_end > addr) { + va = tmp; + if (tmp->va_start <= addr) + break; + + n = n->rb_left; + } else + n = n->rb_right; + } + + return va; +} + +static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) +{ + struct rb_node *n = root->rb_node; + + addr = (unsigned long)kasan_reset_tag((void *)addr); + + while (n) { + struct vmap_area *va; + + va = rb_entry(n, struct vmap_area, rb_node); + if (addr < va->va_start) + n = n->rb_left; + else if (addr >= va->va_end) + n = n->rb_right; + else + return va; + } + + return NULL; +} + +/* + * This function returns back addresses of parent node + * and its left or right link for further processing. + * + * Otherwise NULL is returned. 
In that case all further
+ * steps regarding inserting of conflicting overlap range
+ * have to be declined and actually considered as a bug.
+ *
+ * The search starts at @root's top node when @root is non-NULL, and at
+ * @from otherwise. For an empty @root, *@parent is set to NULL and the
+ * address of the root link is returned.
+ */
+static __always_inline struct rb_node **
+find_va_links(struct vmap_area *va,
+	struct rb_root *root, struct rb_node *from,
+	struct rb_node **parent)
+{
+	struct vmap_area *tmp_va;
+	struct rb_node **link;
+
+	if (root) {
+		link = &root->rb_node;
+		if (unlikely(!*link)) {
+			*parent = NULL;
+			return link;
+		}
+	} else {
+		link = &from;	/* start the descent at the node supplied by the caller */
+	}
+
+	/*
+	 * Go to the bottom of the tree. When we hit the last point
+	 * we end up with the parent rb_node and the correct direction,
+	 * called "link" here, where the new va->rb_node will be attached.
+	 */
+	do {
+		tmp_va = rb_entry(*link, struct vmap_area, rb_node);
+
+		/*
+		 * During the traversal we also do some sanity checking.
+		 * A partial (left/right) or full overlap triggers a
+		 * WARN() and the lookup is abandoned by returning NULL.
+		 */
+		if (va->va_end <= tmp_va->va_start)
+			link = &(*link)->rb_left;
+		else if (va->va_start >= tmp_va->va_end)
+			link = &(*link)->rb_right;
+		else {
+			WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
+				va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
+
+			return NULL;
+		}
+	} while (*link);
+
+	*parent = &tmp_va->rb_node;	/* last node visited before the empty link */
+	return link;
+}
+/*
+ * Given the (@parent, @link) pair produced by find_va_links(), return the
+ * list node that would follow the new VA: the parent's list successor when
+ * @link is the parent's right link, the parent's own list node otherwise.
+ * Returns NULL when @parent is NULL.
+ */
+static __always_inline struct list_head *
+get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
+{
+	struct list_head *list;
+
+	if (unlikely(!parent))
+		/*
+		 * The red-black tree where we try to find VA neighbors
+		 * before merging or inserting is empty, i.e. it means
+		 * there is no free vmap space. Normally it does not
+		 * happen but we handle this case anyway.
+		 */
+		return NULL;
+
+	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
+	return (&parent->rb_right == link ? list->next : list);
+}
+/*
+ * Attach @va at @link below @parent in @root and add it to the matching
+ * list position. With a non-NULL @parent the list predecessor is derived
+ * from the parent; otherwise @head is used as passed in. @augment selects
+ * the augmented insertion using free_vmap_area_rb_augment_cb.
+ */
+static __always_inline void
+__link_va(struct vmap_area *va, struct rb_root *root,
+	struct rb_node *parent, struct rb_node **link,
+	struct list_head *head, bool augment)
+{
+	/*
+	 * VA is still not in the list, but we can
+	 * identify its future previous list_head node.
+	 */
+	if (likely(parent)) {
+		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
+		if (&parent->rb_right != link)
+			head = head->prev;	/* inserting to the left: predecessor is the parent's prev */
+	}
+
+	/* Insert to the rb-tree */
+	rb_link_node(&va->rb_node, parent, link);
+	if (augment) {
+		/*
+		 * Some explanation here. Just perform simple insertion
+		 * to the tree. We do not set va->subtree_max_size to
+		 * its current size before calling rb_insert_augmented().
+		 * It is because we populate the tree from the bottom
+		 * to parent levels when the node _is_ in the tree.
+		 *
+		 * Therefore we set subtree_max_size to zero after insertion,
+		 * to let __augment_tree_propagate_from() put everything in
+		 * the correct order later on.
+		 */
+		rb_insert_augmented(&va->rb_node,
+			root, &free_vmap_area_rb_augment_cb);
+		va->subtree_max_size = 0;
+	} else {
+		rb_insert_color(&va->rb_node, root);
+	}
+
+	/* Address-sort this list */
+	list_add(&va->list, head);
+}
+/* Plain (non-augmented) form of __link_va(). */
+static __always_inline void
+link_va(struct vmap_area *va, struct rb_root *root,
+	struct rb_node *parent, struct rb_node **link,
+	struct list_head *head)
+{
+	__link_va(va, root, parent, link, head, false);
+}
+/* Augmented form of __link_va(). */
+static __always_inline void
+link_va_augment(struct vmap_area *va, struct rb_root *root,
+	struct rb_node *parent, struct rb_node **link,
+	struct list_head *head)
+{
+	__link_va(va, root, parent, link, head, true);
+}
+/*
+ * Remove @va from @root and from its list, then mark its rb_node as
+ * cleared. Warns and does nothing if the node is already marked empty.
+ */
+static __always_inline void
+__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
+{
+	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
+		return;
+
+	if (augment)
+		rb_erase_augmented(&va->rb_node,
+			root, &free_vmap_area_rb_augment_cb);
+	else
+		rb_erase(&va->rb_node, root);
+
+	list_del_init(&va->list);
+	RB_CLEAR_NODE(&va->rb_node);
+}
+/* Plain (non-augmented) form of __unlink_va(). */
+static __always_inline void
+unlink_va(struct vmap_area *va, struct rb_root *root)
+{
+	__unlink_va(va, root, false);
+}
+/* Augmented form of __unlink_va(). */
+static __always_inline void
+unlink_va_augment(struct vmap_area *va, struct rb_root *root)
+{
+	__unlink_va(va, root, true);
+}
+
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
+/*
+ * Gets called when a node is removed and the tree is rotated.
+ * Returns the largest of @va's own size and the subtree_max_size
+ * values of its left and right children.
+ */
+static __always_inline unsigned long
+compute_subtree_max_size(struct vmap_area *va)
+{
+	return max3(va_size(va),
+		get_subtree_max_size(va->rb_node.rb_left),
+		get_subtree_max_size(va->rb_node.rb_right));
+}
+/*
+ * Debug check: recompute subtree_max_size for every entry on
+ * free_vmap_area_list and report any mismatch with the stored value.
+ */
+static void
+augment_tree_propagate_check(void)
+{
+	struct vmap_area *va;
+	unsigned long computed_size;
+
+	list_for_each_entry(va, &free_vmap_area_list, list) {
+		computed_size = compute_subtree_max_size(va);
+		if (computed_size != va->subtree_max_size)
+			pr_emerg("tree is corrupted: %lu, %lu\n",
+				va_size(va), va->subtree_max_size);
+	}
+}
+#endif
+
+/*
+ * This function populates subtree_max_size from bottom to upper
+ * levels starting from VA point. The propagation must be done
+ * when VA size is modified by changing its va_start/va_end. Or
+ * in case of newly inserting of VA to the tree.
+ *
+ * It means that __augment_tree_propagate_from() must be called:
+ * - After VA has been inserted to the tree(free path);
+ * - After VA has been shrunk(allocation path);
+ * - After VA has been increased(merging path).
+ *
+ * Please note that, it does not mean that upper parent nodes
+ * and their subtree_max_size are recalculated all the time up
+ * to the root node.
+ *
+ *       4--8
+ *        /\
+ *       /  \
+ *      /    \
+ *    2--2  8--8
+ *
+ * For example if we modify the node 4, shrinking it to 2, then
+ * no any modification is required. If we shrink the node 2 to 1
+ * its subtree_max_size is updated only, and set to 1. If we shrink
+ * the node 8 to 6, then its subtree_max_size is set to 6 and parent
+ * node becomes 4--6.
+ */ +static __always_inline void +augment_tree_propagate_from(struct vmap_area *va) +{ + /* + * Populate the tree from bottom towards the root until + * the calculated maximum available size of checked node + * is equal to its current one. + */ + free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL); + +#if DEBUG_AUGMENT_PROPAGATE_CHECK + augment_tree_propagate_check(); +#endif +} + +static void +insert_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + struct rb_node **link; + struct rb_node *parent; + + link = find_va_links(va, root, NULL, &parent); + if (link) + link_va(va, root, parent, link, head); +} + +static void +insert_vmap_area_augment(struct vmap_area *va, + struct rb_node *from, struct rb_root *root, + struct list_head *head) +{ + struct rb_node **link; + struct rb_node *parent; + + if (from) + link = find_va_links(va, NULL, from, &parent); + else + link = find_va_links(va, root, NULL, &parent); + + if (link) { + link_va_augment(va, root, parent, link, head); + augment_tree_propagate_from(va); + } +} + +/* + * Merge de-allocated chunk of VA memory with previous + * and next free blocks. If coalesce is not done a new + * free area is inserted. If VA has been merged, it is + * freed. + * + * Please note, it can return NULL in case of overlap + * ranges, followed by WARN() report. Despite it is a + * buggy behaviour, a system can be alive and keep + * ongoing. + */ +static __always_inline struct vmap_area * +__merge_or_add_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head, bool augment) +{ + struct vmap_area *sibling; + struct list_head *next; + struct rb_node **link; + struct rb_node *parent; + bool merged = false; + + /* + * Find a place in the tree where VA potentially will be + * inserted, unless it is merged with its sibling/siblings. + */ + link = find_va_links(va, root, NULL, &parent); + if (!link) + return NULL; + + /* + * Get next node of VA to check if merging can be done. 
+ */ + next = get_va_next_sibling(parent, link); + if (unlikely(next == NULL)) + goto insert; + + /* + * start end + * | | + * |<------VA------>|<-----Next----->| + * | | + * start end + */ + if (next != head) { + sibling = list_entry(next, struct vmap_area, list); + if (sibling->va_start == va->va_end) { + sibling->va_start = va->va_start; + + /* Free vmap_area object. */ + kmem_cache_free(vmap_area_cachep, va); + + /* Point to the new merged area. */ + va = sibling; + merged = true; + } + } + + /* + * start end + * | | + * |<-----Prev----->|<------VA------>| + * | | + * start end + */ + if (next->prev != head) { + sibling = list_entry(next->prev, struct vmap_area, list); + if (sibling->va_end == va->va_start) { + /* + * If both neighbors are coalesced, it is important + * to unlink the "next" node first, followed by merging + * with "previous" one. Otherwise the tree might not be + * fully populated if a sibling's augmented value is + * "normalized" because of rotation operations. + */ + if (merged) + __unlink_va(va, root, augment); + + sibling->va_end = va->va_end; + + /* Free vmap_area object. */ + kmem_cache_free(vmap_area_cachep, va); + + /* Point to the new merged area. 
*/ + va = sibling; + merged = true; + } + } + +insert: + if (!merged) + __link_va(va, root, parent, link, head, augment); + + return va; +} + +static __always_inline struct vmap_area * +merge_or_add_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + return __merge_or_add_vmap_area(va, root, head, false); +} + +static __always_inline struct vmap_area * +merge_or_add_vmap_area_augment(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + va = __merge_or_add_vmap_area(va, root, head, true); + if (va) + augment_tree_propagate_from(va); + + return va; +} + +static __always_inline bool +is_within_this_va(struct vmap_area *va, unsigned long size, + unsigned long align, unsigned long vstart) +{ + unsigned long nva_start_addr; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Can be overflowed due to big size or alignment. */ + if (nva_start_addr + size < nva_start_addr || + nva_start_addr < vstart) + return false; + + return (nva_start_addr + size <= va->va_end); +} + +/* + * Find the first free block(lowest start address) in the tree, + * that will accomplish the request corresponding to passing + * parameters. Please note, with an alignment bigger than PAGE_SIZE, + * a search length is adjusted to account for worst case alignment + * overhead. + */ +static __always_inline struct vmap_area * +find_vmap_lowest_match(struct rb_root *root, unsigned long size, + unsigned long align, unsigned long vstart, bool adjust_search_size) +{ + struct vmap_area *va; + struct rb_node *node; + unsigned long length; + + /* Start from the root. */ + node = root->rb_node; + + /* Adjust the search size for alignment overhead. */ + length = adjust_search_size ? 
size + align - 1 : size; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + + if (get_subtree_max_size(node->rb_left) >= length && + vstart < va->va_start) { + node = node->rb_left; + } else { + if (is_within_this_va(va, size, align, vstart)) + return va; + + /* + * Does not make sense to go deeper towards the right + * sub-tree if it does not have a free block that is + * equal or bigger to the requested search length. + */ + if (get_subtree_max_size(node->rb_right) >= length) { + node = node->rb_right; + continue; + } + + /* + * OK. We roll back and find the first right sub-tree, + * that will satisfy the search criteria. It can happen + * due to "vstart" restriction or an alignment overhead + * that is bigger then PAGE_SIZE. + */ + while ((node = rb_parent(node))) { + va = rb_entry(node, struct vmap_area, rb_node); + if (is_within_this_va(va, size, align, vstart)) + return va; + + if (get_subtree_max_size(node->rb_right) >= length && + vstart <= va->va_start) { + /* + * Shift the vstart forward. Please note, we update it with + * parent's start address adding "1" because we do not want + * to enter same sub-tree after it has already been checked + * and no suitable free block found there. 
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
/* Needed for get_random_bytes(). */
#include <linux/random.h>

/*
 * Reference implementation for the debug check: scan the list in order
 * and return the first area that can hold the request, or NULL when no
 * area fits.
 */
static struct vmap_area *
find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;

	list_for_each_entry(va, head, list) {
		if (!is_within_this_va(va, size, align, vstart))
			continue;

		return va;
	}

	return NULL;
}

/*
 * Debug check: pick a random vstart above VMALLOC_START, run both the
 * tree search and the linear search for the same request and report
 * when they disagree.
 */
static void
find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
	unsigned long size, unsigned long align)
{
	struct vmap_area *va_1, *va_2;
	unsigned long vstart;
	unsigned int rnd;

	get_random_bytes(&rnd, sizeof(rnd));
	vstart = VMALLOC_START + rnd;

	va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
	va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);

	if (va_1 != va_2)
		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
			va_1, va_2, vstart);
}
#endif
*/ + if (va->va_start == nva_start_addr) { + if (va->va_end == nva_start_addr + size) + type = FL_FIT_TYPE; + else + type = LE_FIT_TYPE; + } else if (va->va_end == nva_start_addr + size) { + type = RE_FIT_TYPE; + } else { + type = NE_FIT_TYPE; + } + + return type; +} + +static __always_inline int +adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, + struct vmap_area *va, unsigned long nva_start_addr, + unsigned long size) +{ + struct vmap_area *lva = NULL; + enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); + + if (type == FL_FIT_TYPE) { + /* + * No need to split VA, it fully fits. + * + * | | + * V NVA V + * |---------------| + */ + unlink_va_augment(va, root); + kmem_cache_free(vmap_area_cachep, va); + } else if (type == LE_FIT_TYPE) { + /* + * Split left edge of fit VA. + * + * | | + * V NVA V R + * |-------|-------| + */ + va->va_start += size; + } else if (type == RE_FIT_TYPE) { + /* + * Split right edge of fit VA. + * + * | | + * L V NVA V + * |-------|-------| + */ + va->va_end = nva_start_addr; + } else if (type == NE_FIT_TYPE) { + /* + * Split no edge of fit VA. + * + * | | + * L V NVA V R + * |---|-------|---| + */ + lva = __this_cpu_xchg(ne_fit_preload_node, NULL); + if (unlikely(!lva)) { + /* + * For percpu allocator we do not do any pre-allocation + * and leave it as it is. The reason is it most likely + * never ends up with NE_FIT_TYPE splitting. In case of + * percpu allocations offsets and sizes are aligned to + * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE + * are its main fitting cases. + * + * There are a few exceptions though, as an example it is + * a first allocation (early boot up) when we have "one" + * big free space that has to be split. + * + * Also we can hit this path in case of regular "vmap" + * allocations, if "this" current CPU was not preloaded. + * See the comment in alloc_vmap_area() why. If so, then + * GFP_NOWAIT is used instead to get an extra object for + * split purpose. 
That is rare and most time does not + * occur. + * + * What happens if an allocation gets failed. Basically, + * an "overflow" path is triggered to purge lazily freed + * areas to free some memory, then, the "retry" path is + * triggered to repeat one more time. See more details + * in alloc_vmap_area() function. + */ + lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); + if (!lva) + return -1; + } + + /* + * Build the remainder. + */ + lva->va_start = va->va_start; + lva->va_end = nva_start_addr; + + /* + * Shrink this VA to remaining size. + */ + va->va_start = nva_start_addr + size; + } else { + return -1; + } + + if (type != FL_FIT_TYPE) { + augment_tree_propagate_from(va); + + if (lva) /* type == NE_FIT_TYPE */ + insert_vmap_area_augment(lva, &va->rb_node, root, head); + } + + return 0; +} + +/* + * Returns a start address of the newly allocated area, if success. + * Otherwise a vend is returned that indicates failure. + */ +static __always_inline unsigned long +__alloc_vmap_area(struct rb_root *root, struct list_head *head, + unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend) +{ + bool adjust_search_size = true; + unsigned long nva_start_addr; + struct vmap_area *va; + int ret; + + /* + * Do not adjust when: + * a) align <= PAGE_SIZE, because it does not make any sense. + * All blocks(their start addresses) are at least PAGE_SIZE + * aligned anyway; + * b) a short range where a requested size corresponds to exactly + * specified [vstart:vend] interval and an alignment > PAGE_SIZE. + * With adjusted search length an allocation would not succeed. 
/*
 * Free a region of KVA allocated by alloc_vmap_area.
 *
 * The area is moved from the busy tree/list to the free tree/list. Each
 * step runs under its own lock and the two locks are never held at the
 * same time.
 */
static void free_vmap_area(struct vmap_area *va)
{
	/*
	 * Remove from the busy tree/list.
	 */
	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	/*
	 * Insert/Merge it back to the free tree/list.
	 */
	spin_lock(&free_vmap_area_lock);
	merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}
+ */ + if (!this_cpu_read(ne_fit_preload_node)) + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); + + spin_lock(lock); + + if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va)) + kmem_cache_free(vmap_area_cachep, va); +} + +/* + * Allocate a region of KVA of the specified size and alignment, within the + * vstart and vend. + */ +static struct vmap_area *alloc_vmap_area(unsigned long size, + unsigned long align, + unsigned long vstart, unsigned long vend, + int node, gfp_t gfp_mask) +{ + struct vmap_area *va; + unsigned long freed; + unsigned long addr; + int purged = 0; + int ret; + + BUG_ON(!size); + BUG_ON(offset_in_page(size)); + BUG_ON(!is_power_of_2(align)); + + if (unlikely(!vmap_initialized)) + return ERR_PTR(-EBUSY); + + might_sleep(); + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; + + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); + if (unlikely(!va)) + return ERR_PTR(-ENOMEM); + + /* + * Only scan the relevant parts containing pointers to other objects + * to avoid false negatives. + */ + kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); + +retry: + preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); + addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, + size, align, vstart, vend); + spin_unlock(&free_vmap_area_lock); + + /* + * If an allocation fails, the "vend" address is + * returned. Therefore trigger the overflow path. 
+ */ + if (unlikely(addr == vend)) + goto overflow; + + va->va_start = addr; + va->va_end = addr + size; + va->vm = NULL; + + spin_lock(&vmap_area_lock); + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + spin_unlock(&vmap_area_lock); + + BUG_ON(!IS_ALIGNED(va->va_start, align)); + BUG_ON(va->va_start < vstart); + BUG_ON(va->va_end > vend); + + ret = kasan_populate_vmalloc(addr, size); + if (ret) { + free_vmap_area(va); + return ERR_PTR(ret); + } + + return va; + +overflow: + if (!purged) { + purge_vmap_area_lazy(); + purged = 1; + goto retry; + } + + freed = 0; + blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); + + if (freed > 0) { + purged = 0; + goto retry; + } + + if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) + pr_warn("vmap allocation for size %lu failed: use vmalloc= to increase size\n", + size); + + kmem_cache_free(vmap_area_cachep, va); + return ERR_PTR(-EBUSY); +} + +int register_vmap_purge_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&vmap_notify_list, nb); +} +EXPORT_SYMBOL_GPL(register_vmap_purge_notifier); + +int unregister_vmap_purge_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&vmap_notify_list, nb); +} +EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); + +/* + * lazy_max_pages is the maximum amount of virtual address space we gather up + * before attempting to purge with a TLB flush. + * + * There is a tradeoff here: a larger number will cover more kernel page tables + * and take slightly longer to purge, but it will linearly reduce the number of + * global TLB flushes that must be performed. It would seem natural to scale + * this number up linearly with the number of CPUs (because vmapping activity + * could also scale linearly with the number of CPUs), however it is likely + * that in practice, workloads might be constrained in other ways that mean + * vmap activity will not scale linearly with CPUs. 
Also, I want to be + * conservative and not introduce a big latency on huge systems, so go with + * a less aggressive log scale. It will still be an improvement over the old + * code, and it will be simple to change the scale factor if we find that it + * becomes a problem on bigger systems. + */ +static unsigned long lazy_max_pages(void) +{ + unsigned int log; + + log = fls(num_online_cpus()); + + return log * (32UL * 1024 * 1024 / PAGE_SIZE); +} + +static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); + +/* + * Serialize vmap purging. There is no actual critical section protected + * by this lock, but we want to avoid concurrent calls for performance + * reasons and to make the pcpu_get_vm_areas more deterministic. + */ +static DEFINE_MUTEX(vmap_purge_lock); + +/* for per-CPU blocks */ +static void purge_fragmented_blocks_allcpus(void); + +/* + * Purges all lazily-freed vmap areas. + */ +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) +{ + unsigned long resched_threshold; + struct list_head local_purge_list; + struct vmap_area *va, *n_va; + + lockdep_assert_held(&vmap_purge_lock); + + spin_lock(&purge_vmap_area_lock); + purge_vmap_area_root = RB_ROOT; + list_replace_init(&purge_vmap_area_list, &local_purge_list); + spin_unlock(&purge_vmap_area_lock); + + if (unlikely(list_empty(&local_purge_list))) + return false; + + start = min(start, + list_first_entry(&local_purge_list, + struct vmap_area, list)->va_start); + + end = max(end, + list_last_entry(&local_purge_list, + struct vmap_area, list)->va_end); + + flush_tlb_kernel_range(start, end); + resched_threshold = lazy_max_pages() << 1; + + spin_lock(&free_vmap_area_lock); + list_for_each_entry_safe(va, n_va, &local_purge_list, list) { + unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + unsigned long orig_start = va->va_start; + unsigned long orig_end = va->va_end; + + /* + * Finally insert or merge lazily-freed area. 
It is + * detached and there is no need to "unlink" it from + * anything. + */ + va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, + &free_vmap_area_list); + + if (!va) + continue; + + if (is_vmalloc_or_module_addr((void *)orig_start)) + kasan_release_vmalloc(orig_start, orig_end, + va->va_start, va->va_end); + + atomic_long_sub(nr, &vmap_lazy_nr); + + if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) + cond_resched_lock(&free_vmap_area_lock); + } + spin_unlock(&free_vmap_area_lock); + return true; +} + +/* + * Kick off a purge of the outstanding lazy areas. + */ +static void purge_vmap_area_lazy(void) +{ + mutex_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); +} + +static void drain_vmap_area_work(struct work_struct *work) +{ + unsigned long nr_lazy; + + do { + mutex_lock(&vmap_purge_lock); + __purge_vmap_area_lazy(ULONG_MAX, 0); + mutex_unlock(&vmap_purge_lock); + + /* Recheck if further work is required. */ + nr_lazy = atomic_long_read(&vmap_lazy_nr); + } while (nr_lazy > lazy_max_pages()); +} + +/* + * Free a vmap area, caller ensuring that the area has been unmapped + * and flush_cache_vunmap had been called for the correct range + * previously. + */ +static void free_vmap_area_noflush(struct vmap_area *va) +{ + unsigned long nr_lazy; + + spin_lock(&vmap_area_lock); + unlink_va(va, &vmap_area_root); + spin_unlock(&vmap_area_lock); + + nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> + PAGE_SHIFT, &vmap_lazy_nr); + + /* + * Merge or place it to the purge tree/list. 
+ */ + spin_lock(&purge_vmap_area_lock); + merge_or_add_vmap_area(va, + &purge_vmap_area_root, &purge_vmap_area_list); + spin_unlock(&purge_vmap_area_lock); + + /* After this point, we may free va at any time */ + if (unlikely(nr_lazy > lazy_max_pages())) + schedule_work(&drain_vmap_work); +} + +/* + * Free and unmap a vmap area + */ +static void free_unmap_vmap_area(struct vmap_area *va) +{ + flush_cache_vunmap(va->va_start, va->va_end); + vunmap_range_noflush(va->va_start, va->va_end); + if (debug_pagealloc_enabled_static()) + flush_tlb_kernel_range(va->va_start, va->va_end); + + free_vmap_area_noflush(va); +} + +struct vmap_area *find_vmap_area(unsigned long addr) +{ + struct vmap_area *va; + + spin_lock(&vmap_area_lock); + va = __find_vmap_area(addr, &vmap_area_root); + spin_unlock(&vmap_area_lock); + + return va; +} + +/*** Per cpu kva allocator ***/ + +/* + * vmap space is limited especially on 32 bit architectures. Ensure there is + * room for at least 16 percpu vmap blocks per CPU. + */ +/* + * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able + * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess + * instead (we just need a rough idea) + */ +#if BITS_PER_LONG == 32 +#define VMALLOC_SPACE (128UL*1024*1024) +#else +#define VMALLOC_SPACE (128UL*1024*1024*1024) +#endif + +#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) +#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ +#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ +#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) +#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ +#define VMAP_MAX(x, y) ((x) > (y) ? 
(x) : (y)) /* can't use max() */ +#define VMAP_BBMAP_BITS \ + VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ + VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ + VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) + +#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) + +struct vmap_block_queue { + spinlock_t lock; + struct list_head free; +}; + +struct vmap_block { + spinlock_t lock; + struct vmap_area *va; + unsigned long free, dirty; + unsigned long dirty_min, dirty_max; /*< dirty range */ + struct list_head free_list; + struct rcu_head rcu_head; + struct list_head purge; +}; + +/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ +static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); + +/* + * XArray of vmap blocks, indexed by address, to quickly find a vmap block + * in the free path. Could get rid of this if we change the API to return a + * "cookie" from alloc, to be passed to free. But no big deal yet. + */ +static DEFINE_XARRAY(vmap_blocks); + +/* + * We should probably have a fallback mechanism to allocate virtual memory + * out of partially filled vmap blocks. However vmap block sizing should be + * fairly reasonable according to the vmalloc size, so it shouldn't be a + * big problem. + */ + +static unsigned long addr_to_vb_idx(unsigned long addr) +{ + addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); + addr /= VMAP_BLOCK_SIZE; + return addr; +} + +static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) +{ + unsigned long addr; + + addr = va_start + (pages_off << PAGE_SHIFT); + BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); + return (void *)addr; +} + +/** + * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this + * block. 
Of course pages number can't exceed VMAP_BBMAP_BITS + * @order: how many 2^order pages should be occupied in newly allocated block + * @gfp_mask: flags for the page level allocator + * + * Return: virtual address in a newly allocated block or ERR_PTR(-errno) + */ +static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) +{ + struct vmap_block_queue *vbq; + struct vmap_block *vb; + struct vmap_area *va; + unsigned long vb_idx; + int node, err; + void *vaddr; + + node = numa_node_id(); + + vb = kmalloc_node(sizeof(struct vmap_block), + gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!vb)) + return ERR_PTR(-ENOMEM); + + va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, + VMALLOC_START, VMALLOC_END, + node, gfp_mask); + if (IS_ERR(va)) { + kfree(vb); + return ERR_CAST(va); + } + + vaddr = vmap_block_vaddr(va->va_start, 0); + spin_lock_init(&vb->lock); + vb->va = va; + /* At least something should be left free */ + BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); + vb->free = VMAP_BBMAP_BITS - (1UL << order); + vb->dirty = 0; + vb->dirty_min = VMAP_BBMAP_BITS; + vb->dirty_max = 0; + INIT_LIST_HEAD(&vb->free_list); + + vb_idx = addr_to_vb_idx(va->va_start); + err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask); + if (err) { + kfree(vb); + free_vmap_area(va); + return ERR_PTR(err); + } + + vbq = raw_cpu_ptr(&vmap_block_queue); + spin_lock(&vbq->lock); + list_add_tail_rcu(&vb->free_list, &vbq->free); + spin_unlock(&vbq->lock); + + return vaddr; +} + +static void free_vmap_block(struct vmap_block *vb) +{ + struct vmap_block *tmp; + + tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start)); + BUG_ON(tmp != vb); + + free_vmap_area_noflush(vb->va); + kfree_rcu(vb, rcu_head); +} + +static void purge_fragmented_blocks(int cpu) +{ + LIST_HEAD(purge); + struct vmap_block *vb; + struct vmap_block *n_vb; + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + + if (!(vb->free + 
/* Run purge_fragmented_blocks() for the block queue of every possible CPU. */
static void purge_fragmented_blocks_allcpus(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		purge_fragmented_blocks(cpu);
}
+ */ + return NULL; + } + order = get_order(size); + + rcu_read_lock(); + vbq = raw_cpu_ptr(&vmap_block_queue); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + unsigned long pages_off; + + spin_lock(&vb->lock); + if (vb->free < (1UL << order)) { + spin_unlock(&vb->lock); + continue; + } + + pages_off = VMAP_BBMAP_BITS - vb->free; + vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); + vb->free -= 1UL << order; + if (vb->free == 0) { + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); + } + + spin_unlock(&vb->lock); + break; + } + + rcu_read_unlock(); + + /* Allocate new block if nothing was found */ + if (!vaddr) + vaddr = new_vmap_block(order, gfp_mask); + + return vaddr; +} + +static void vb_free(unsigned long addr, unsigned long size) +{ + unsigned long offset; + unsigned int order; + struct vmap_block *vb; + + BUG_ON(offset_in_page(size)); + BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + + flush_cache_vunmap(addr, addr + size); + + order = get_order(size); + offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; + vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); + + vunmap_range_noflush(addr, addr + size); + + if (debug_pagealloc_enabled_static()) + flush_tlb_kernel_range(addr, addr + size); + + spin_lock(&vb->lock); + + /* Expand dirty range */ + vb->dirty_min = min(vb->dirty_min, offset); + vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); + + vb->dirty += 1UL << order; + if (vb->dirty == VMAP_BBMAP_BITS) { + BUG_ON(vb->free); + spin_unlock(&vb->lock); + free_vmap_block(vb); + } else + spin_unlock(&vb->lock); +} + +static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) +{ + int cpu; + + if (unlikely(!vmap_initialized)) + return; + + might_sleep(); + + for_each_possible_cpu(cpu) { + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + struct vmap_block *vb; + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + spin_lock(&vb->lock); + if (vb->dirty 
&& vb->dirty != VMAP_BBMAP_BITS) { + unsigned long va_start = vb->va->va_start; + unsigned long s, e; + + s = va_start + (vb->dirty_min << PAGE_SHIFT); + e = va_start + (vb->dirty_max << PAGE_SHIFT); + + start = min(s, start); + end = max(e, end); + + flush = 1; + } + spin_unlock(&vb->lock); + } + rcu_read_unlock(); + } + + mutex_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + if (!__purge_vmap_area_lazy(start, end) && flush) + flush_tlb_kernel_range(start, end); + mutex_unlock(&vmap_purge_lock); +} + +/** + * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer + * + * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily + * to amortize TLB flushing overheads. What this means is that any page you + * have now, may, in a former life, have been mapped into kernel virtual + * address by the vmap layer and so there might be some CPUs with TLB entries + * still referencing that page (additional to the regular 1:1 kernel mapping). + * + * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can + * be sure that none of the pages we have control over will have any aliases + * from the vmap layer. 
+ */ +void vm_unmap_aliases(void) +{ + unsigned long start = ULONG_MAX, end = 0; + int flush = 0; + + _vm_unmap_aliases(start, end, flush); +} +EXPORT_SYMBOL_GPL(vm_unmap_aliases); + +/** + * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram + * @mem: the pointer returned by vm_map_ram + * @count: the count passed to that vm_map_ram call (cannot unmap partial) + */ +void vm_unmap_ram(const void *mem, unsigned int count) +{ + unsigned long size = (unsigned long)count << PAGE_SHIFT; + unsigned long addr = (unsigned long)kasan_reset_tag(mem); + struct vmap_area *va; + + might_sleep(); + BUG_ON(!addr); + BUG_ON(addr < VMALLOC_START); + BUG_ON(addr > VMALLOC_END); + BUG_ON(!PAGE_ALIGNED(addr)); + + kasan_poison_vmalloc(mem, size); + + if (likely(count <= VMAP_MAX_ALLOC)) { + debug_check_no_locks_freed(mem, size); + vb_free(addr, size); + return; + } + + va = find_vmap_area(addr); + BUG_ON(!va); + debug_check_no_locks_freed((void *)va->va_start, + (va->va_end - va->va_start)); + free_unmap_vmap_area(va); +} +EXPORT_SYMBOL(vm_unmap_ram); + +/** + * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) + * @pages: an array of pointers to the pages to be mapped + * @count: number of pages + * @node: prefer to allocate data structures on this node + * + * If you use this function for less than VMAP_MAX_ALLOC pages, it could be + * faster than vmap so it's good. But if you mix long-life and short-life + * objects with vm_map_ram(), it could consume lots of address space through + * fragmentation (especially on a 32bit machine). You could see failures in + * the end. Please use this function for short-lived objects. 
+ * + * Returns: a pointer to the address that has been mapped, or %NULL on failure + */ +void *vm_map_ram(struct page **pages, unsigned int count, int node) +{ + unsigned long size = (unsigned long)count << PAGE_SHIFT; + unsigned long addr; + void *mem; + + if (likely(count <= VMAP_MAX_ALLOC)) { + mem = vb_alloc(size, GFP_KERNEL); + if (IS_ERR(mem)) + return NULL; + addr = (unsigned long)mem; + } else { + struct vmap_area *va; + va = alloc_vmap_area(size, PAGE_SIZE, + VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); + if (IS_ERR(va)) + return NULL; + + addr = va->va_start; + mem = (void *)addr; + } + + if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, + pages, PAGE_SHIFT) < 0) { + vm_unmap_ram(mem, count); + return NULL; + } + + /* + * Mark the pages as accessible, now that they are mapped. + * With hardware tag-based KASAN, marking is skipped for + * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). + */ + mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); + + return mem; +} +EXPORT_SYMBOL(vm_map_ram); + +static struct vm_struct *vmlist __initdata; + +static inline unsigned int vm_area_page_order(struct vm_struct *vm) +{ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + return vm->page_order; +#else + return 0; +#endif +} + +static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) +{ +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC + vm->page_order = order; +#else + BUG_ON(order != 0); +#endif +} + +/** + * vm_area_add_early - add vmap area early during boot + * @vm: vm_struct to add + * + * This function is used to add fixed kernel vm area to vmlist before + * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags + * should contain proper values and the other fields should be zero. + * + * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 
+ */
+void __init vm_area_add_early(struct vm_struct *vm)
+{
+	struct vm_struct *tmp, **p;
+
+	BUG_ON(vmap_initialized);
+	/*
+	 * Walk vmlist to the first entry at or above @vm->addr, checking with
+	 * BUG_ON() that @vm overlaps neither that entry nor any entry before it.
+	 */
+	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
+		if (tmp->addr >= vm->addr) {
+			BUG_ON(tmp->addr < vm->addr + vm->size);
+			break;
+		} else
+			BUG_ON(tmp->addr + tmp->size > vm->addr);
+	}
+	vm->next = *p;
+	*p = vm;
+}
+
+/**
+ * vm_area_register_early - register vmap area early during boot
+ * @vm: vm_struct to register
+ * @align: requested alignment
+ *
+ * This function is used to register kernel vm area before
+ * vmalloc_init() is called.  @vm->size and @vm->flags should contain
+ * proper values on entry and other fields should be zero.  On return,
+ * vm->addr contains the allocated address.
+ *
+ * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
+ */
+void __init vm_area_register_early(struct vm_struct *vm, size_t align)
+{
+	unsigned long addr = ALIGN(VMALLOC_START, align);
+	struct vm_struct *cur, **p;
+
+	BUG_ON(vmap_initialized);
+
+	/*
+	 * First fit: stop at the first gap below an existing entry that can
+	 * hold @vm->size, otherwise retry just past that entry.
+	 */
+	for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
+		if ((unsigned long)cur->addr - addr >= vm->size)
+			break;
+		addr = ALIGN((unsigned long)cur->addr + cur->size, align);
+	}
+
+	BUG_ON(addr > VMALLOC_END - vm->size);
+	vm->addr = (void *)addr;
+	vm->next = *p;
+	*p = vm;
+	kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
+}
+
+/*
+ * Populate the free-area tree/list with every gap between the busy areas
+ * on vmap_area_list, covering the range [1, ULONG_MAX).
+ */
+static void vmap_init_free_space(void)
+{
+	unsigned long vmap_start = 1;
+	const unsigned long vmap_end = ULONG_MAX;
+	struct vmap_area *busy, *free;
+
+	/*
+	 *     B     F     B     B     B     F
+	 * -|-----|.....|-----|-----|-----|.....|-
+	 *  |           The KVA space           |
+	 *  |<--------------------------------->|
+	 */
+	list_for_each_entry(busy, &vmap_area_list, list) {
+		if (busy->va_start - vmap_start > 0) {
+			free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+			if (!WARN_ON_ONCE(!free)) {
+				free->va_start = vmap_start;
+				free->va_end = busy->va_start;
+
+				insert_vmap_area_augment(free, NULL,
+					&free_vmap_area_root,
+						&free_vmap_area_list);
+			}
+		}
+
+		vmap_start = busy->va_end;
+	}
+
+	/* Trailing gap after the last busy area, if any. */
+	if (vmap_end - vmap_start > 0) {
+		free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+		if (!WARN_ON_ONCE(!free)) {
+			free->va_start = vmap_start;
+			free->va_end = vmap_end;
+
+			insert_vmap_area_augment(free, NULL,
+				&free_vmap_area_root,
+					&free_vmap_area_list);
+		}
+	}
+}
+
+/*
+ * Boot-time initialisation: create the vmap_area cache, set up the per-CPU
+ * block queues and deferred-free state, import the early vmlist entries as
+ * busy areas and then build the free space.
+ */
+void __init vmalloc_init(void)
+{
+	struct vmap_area *va;
+	struct vm_struct *tmp;
+	int i;
+
+	/*
+	 * Create the cache for vmap_area objects.
+	 */
+	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
+
+	for_each_possible_cpu(i) {
+		struct vmap_block_queue *vbq;
+		struct vfree_deferred *p;
+
+		vbq = &per_cpu(vmap_block_queue, i);
+		spin_lock_init(&vbq->lock);
+		INIT_LIST_HEAD(&vbq->free);
+		p = &per_cpu(vfree_deferred, i);
+		init_llist_head(&p->list);
+		INIT_WORK(&p->wq, free_work);
+	}
+
+	/* Import existing vmlist entries. */
+	for (tmp = vmlist; tmp; tmp = tmp->next) {
+		va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+		if (WARN_ON_ONCE(!va))
+			continue;
+
+		va->va_start = (unsigned long)tmp->addr;
+		va->va_end = va->va_start + tmp->size;
+		va->vm = tmp;
+		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+	}
+
+	/*
+	 * Now we can initialize a free vmap space.
+	 */
+	vmap_init_free_space();
+	vmap_initialized = true;
+}
+
+/*
+ * Fill @vm from @va and link the two.  The caller is expected to hold
+ * vmap_area_lock (see setup_vmalloc_vm()).
+ */
+static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+	struct vmap_area *va, unsigned long flags, const void *caller)
+{
+	vm->flags = flags;
+	vm->addr = (void *)va->va_start;
+	vm->size = va->va_end - va->va_start;
+	vm->caller = caller;
+	va->vm = vm;
+}
+
+/* Locked wrapper around setup_vmalloc_vm_locked(). */
+static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+			      unsigned long flags, const void *caller)
+{
+	spin_lock(&vmap_area_lock);
+	setup_vmalloc_vm_locked(vm, va, flags, caller);
+	spin_unlock(&vmap_area_lock);
+}
+
+static void clear_vm_uninitialized_flag(struct vm_struct *vm)
+{
+	/*
+	 * Before removing VM_UNINITIALIZED,
+	 * we should make sure that vm has proper values.
+	 * Pair with smp_rmb() in show_numa_info().
+	 */
+	smp_wmb();
+	vm->flags &= ~VM_UNINITIALIZED;
+}
+
+/*
+ * Allocate a vm_struct and reserve a virtual range for it in [@start, @end).
+ * @size is rounded up to a multiple of (1 << @shift); one guard page is
+ * added unless VM_NO_GUARD is set.  Returns NULL on failure.
+ * Must not be called from interrupt context (BUG_ON below).
+ */
+static struct vm_struct *__get_vm_area_node(unsigned long size,
+		unsigned long align, unsigned long shift, unsigned long flags,
+		unsigned long start, unsigned long end, int node,
+		gfp_t gfp_mask, const void *caller)
+{
+	struct vmap_area *va;
+	struct vm_struct *area;
+	unsigned long requested_size = size;
+
+	BUG_ON(in_interrupt());
+	size = ALIGN(size, 1ul << shift);
+	if (unlikely(!size))
+		return NULL;
+
+	/* I/O mappings override @align with one derived from the size. */
+	if (flags & VM_IOREMAP)
+		align = 1ul << clamp_t(int, get_count_order_long(size),
+				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
+
+	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
+	if (unlikely(!area))
+		return NULL;
+
+	if (!(flags & VM_NO_GUARD))
+		size += PAGE_SIZE;
+
+	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+	if (IS_ERR(va)) {
+		kfree(area);
+		return NULL;
+	}
+
+	setup_vmalloc_vm(area, va, flags, caller);
+
+	/*
+	 * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
+	 * best-effort approach, as they can be mapped outside of vmalloc code.
+	 * For VM_ALLOC mappings, the pages are marked as accessible after
+	 * getting mapped in __vmalloc_node_range().
+	 * With hardware tag-based KASAN, marking is skipped for
+	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+	 */
+	if (!(flags & VM_ALLOC))
+		area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
+						    KASAN_VMALLOC_PROT_NORMAL);
+
+	return area;
+}
+
+/*
+ * Like get_vm_area_caller(), but the search is restricted to the
+ * caller-supplied range [@start, @end).
+ */
+struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
+				       unsigned long start, unsigned long end,
+				       const void *caller)
+{
+	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
+				  NUMA_NO_NODE, GFP_KERNEL, caller);
+}
+
+/**
+ * get_vm_area - reserve a contiguous kernel virtual area
+ * @size:	 size of the area
+ * @flags:	 %VM_IOREMAP for I/O mappings or VM_ALLOC
+ *
+ * Search for an area of @size in the kernel virtual mapping area
+ * and reserve it for our purposes.
+ *
+ * Return: the area descriptor on success or %NULL on failure.
+ */
+struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
+{
+	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+				  VMALLOC_START, VMALLOC_END,
+				  NUMA_NO_NODE, GFP_KERNEL,
+				  __builtin_return_address(0));
+}
+
+/* Same as get_vm_area(), but records @caller instead of our return address. */
+struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
+				const void *caller)
+{
+	return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+				  VMALLOC_START, VMALLOC_END,
+				  NUMA_NO_NODE, GFP_KERNEL, caller);
+}
+
+/**
+ * find_vm_area - find a continuous kernel virtual area
+ * @addr:	  base address
+ *
+ * Search for the kernel VM area starting at @addr, and return it.
+ * It is up to the caller to do all required locking to keep the returned
+ * pointer valid.
+ *
+ * Return: the area descriptor on success or %NULL on failure.
+ */
+struct vm_struct *find_vm_area(const void *addr)
+{
+	struct vmap_area *va;
+
+	va = find_vmap_area((unsigned long)addr);
+	if (!va)
+		return NULL;
+
+	return va->vm;
+}
+
+/**
+ * remove_vm_area - find and remove a continuous kernel virtual area
+ * @addr:	    base address
+ *
+ * Search for the kernel VM area starting at @addr, and remove it.
+ * This function returns the found VM area, but using it is NOT safe
+ * on SMP machines, except for its size or flags.
+ *
+ * Return: the area descriptor on success or %NULL on failure.
+ */
+struct vm_struct *remove_vm_area(const void *addr)
+{
+	struct vmap_area *va;
+
+	might_sleep();
+
+	spin_lock(&vmap_area_lock);
+	va = __find_vmap_area((unsigned long)addr, &vmap_area_root);
+	if (va && va->vm) {
+		struct vm_struct *vm = va->vm;
+
+		/* Detach the vm_struct under the lock, then tear down unlocked. */
+		va->vm = NULL;
+		spin_unlock(&vmap_area_lock);
+
+		kasan_free_module_shadow(vm);
+		free_unmap_vmap_area(va);
+
+		return vm;
+	}
+
+	spin_unlock(&vmap_area_lock);
+	return NULL;
+}
+
+/*
+ * Call @set_direct_map on every page of @area for which page_address()
+ * returns a non-NULL address.  Return values of the callback are ignored.
+ */
+static inline void set_area_direct_map(const struct vm_struct *area,
+				       int (*set_direct_map)(struct page *page))
+{
+	int i;
+
+	/* HUGE_VMALLOC passes small pages to set_direct_map */
+	for (i = 0; i < area->nr_pages; i++)
+		if (page_address(area->pages[i]))
+			set_direct_map(area->pages[i]);
+}
+
+/* Handle removing and resetting vm mappings related to the vm_struct. */
+static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
+{
+	unsigned long start = ULONG_MAX, end = 0;
+	unsigned int page_order = vm_area_page_order(area);
+	int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
+	int flush_dmap = 0;
+	int i;
+
+	remove_vm_area(area->addr);
+
+	/* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
+	if (!flush_reset)
+		return;
+
+	/*
+	 * If not deallocating pages, just do the flush of the VM area and
+	 * return.
+	 */
+	if (!deallocate_pages) {
+		vm_unmap_aliases();
+		return;
+	}
+
+	/*
+	 * If execution gets here, flush the vm mapping and reset the direct
+	 * map. Find the start and end range of the direct mappings to make sure
+	 * the vm_unmap_aliases() flush includes the direct map.
+	 */
+	for (i = 0; i < area->nr_pages; i += 1U << page_order) {
+		unsigned long addr = (unsigned long)page_address(area->pages[i]);
+		if (addr) {
+			unsigned long page_size;
+
+			page_size = PAGE_SIZE << page_order;
+			start = min(addr, start);
+			end = max(addr + page_size, end);
+			flush_dmap = 1;
+		}
+	}
+
+	/*
+	 * Set direct map to something invalid so that it won't be cached if
+	 * there are any accesses after the TLB flush, then flush the TLB and
+	 * reset the direct map permissions to the default.
+	 */
+	set_area_direct_map(area, set_direct_map_invalid_noflush);
+	_vm_unmap_aliases(start, end, flush_dmap);
+	set_area_direct_map(area, set_direct_map_default_noflush);
+}
+
+/*
+ * Tear down the vm area starting at @addr and free its vm_struct.  When
+ * @deallocate_pages is non-zero the backing pages and the page array are
+ * freed as well.  A NULL, unaligned or unknown @addr is ignored (the latter
+ * two with a WARN).
+ */
+static void __vunmap(const void *addr, int deallocate_pages)
+{
+	struct vm_struct *area;
+
+	if (!addr)
+		return;
+
+	if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
+			addr))
+		return;
+
+	area = find_vm_area(addr);
+	if (unlikely(!area)) {
+		WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
+				addr);
+		return;
+	}
+
+	debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
+	debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
+
+	kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
+
+	vm_remove_mappings(area, deallocate_pages);
+
+	if (deallocate_pages) {
+		int i;
+
+		for (i = 0; i < area->nr_pages; i++) {
+			struct page *page = area->pages[i];
+
+			BUG_ON(!page);
+			mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
+			/*
+			 * High-order allocs for huge vmallocs are split, so
+			 * can be freed as an array of order-0 allocations
+			 */
+			__free_pages(page, 0);
+			cond_resched();
+		}
+		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
+
+		kvfree(area->pages);
+	}
+
+	kfree(area);
+}
+
+/*
+ * Queue @addr on this CPU's deferred-free list; the freed memory itself is
+ * reused as the llist node.  The work item is scheduled only when
+ * llist_add() reports that the list was empty before the add.
+ */
+static inline void __vfree_deferred(const void *addr)
+{
+	/*
+	 * Use raw_cpu_ptr() because this can be called from preemptible
+	 * context. Preemption is absolutely fine here, because the llist_add()
+	 * implementation is lockless, so it works even if we are adding to
+	 * another cpu's list. schedule_work() should be fine with this too.
+	 */
+	struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
+
+	if (llist_add((struct llist_node *)addr, &p->list))
+		schedule_work(&p->wq);
+}
+
+/**
+ * vfree_atomic - release memory allocated by vmalloc()
+ * @addr:	  memory base address
+ *
+ * This one is just like vfree() but can be called in any atomic context
+ * except NMIs.
+ *
+ * The free is always deferred to a work item.  A %NULL @addr is ignored.
+ */
+void vfree_atomic(const void *addr)
+{
+	BUG_ON(in_nmi());
+
+	kmemleak_free(addr);
+
+	if (!addr)
+		return;
+	__vfree_deferred(addr);
+}
+
+/* Free now when possible; defer to a work item in interrupt context. */
+static void __vfree(const void *addr)
+{
+	if (unlikely(in_interrupt()))
+		__vfree_deferred(addr);
+	else
+		__vunmap(addr, 1);
+}
+
+/**
+ * vfree - Release memory allocated by vmalloc()
+ * @addr:  Memory base address
+ *
+ * Free the virtually continuous memory area starting at @addr, as obtained
+ * from one of the vmalloc() family of APIs.  This will usually also free the
+ * physical memory underlying the virtual allocation, but that memory is
+ * reference counted, so it will not be freed until the last user goes away.
+ *
+ * If @addr is NULL, no operation is performed.
+ *
+ * Context:
+ * May sleep if called *not* from interrupt context.
+ * Must not be called in NMI context (strictly speaking, it could be
+ * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
+ * conventions for vfree() arch-dependent would be a really bad idea).
+ */
+void vfree(const void *addr)
+{
+	BUG_ON(in_nmi());
+
+	kmemleak_free(addr);
+
+	might_sleep_if(!in_interrupt());
+
+	if (!addr)
+		return;
+
+	__vfree(addr);
+}
+EXPORT_SYMBOL(vfree);
+
+/**
+ * vunmap - release virtual mapping obtained by vmap()
+ * @addr:   memory base address
+ *
+ * Free the virtually contiguous memory area starting at @addr,
+ * which was created from the page array passed to vmap().
+ *
+ * Must not be called in interrupt context.
+ */
+void vunmap(const void *addr)
+{
+	BUG_ON(in_interrupt());
+	might_sleep();
+	/* Second argument 0: keep the backing pages, drop only the mapping. */
+	if (addr)
+		__vunmap(addr, 0);
+}
+EXPORT_SYMBOL(vunmap);
+
+/**
+ * vmap - map an array of pages into virtually contiguous space
+ * @pages: array of page pointers
+ * @count: number of pages to map
+ * @flags: vm_area->flags
+ * @prot: page protection for the mapping
+ *
+ * Maps @count pages from @pages into contiguous kernel virtual space.
+ * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
+ * (which must be kmalloc or vmalloc memory) and one reference per pages in it
+ * are transferred from the caller to vmap(), and will be freed / dropped when
+ * vfree() is called on the return value.
+ *
+ * Return: the address of the area or %NULL on failure
+ */
+void *vmap(struct page **pages, unsigned int count,
+	   unsigned long flags, pgprot_t prot)
+{
+	struct vm_struct *area;
+	unsigned long addr;
+	unsigned long size;		/* In bytes */
+
+	might_sleep();
+
+	/*
+	 * Your top guard is someone else's bottom guard. Not having a top
+	 * guard compromises someone else's mappings too.
+	 */
+	if (WARN_ON_ONCE(flags & VM_NO_GUARD))
+		flags &= ~VM_NO_GUARD;
+
+	/* Reject requests for more pages than the system has. */
+	if (count > totalram_pages())
+		return NULL;
+
+	size = (unsigned long)count << PAGE_SHIFT;
+	area = get_vm_area_caller(size, flags, __builtin_return_address(0));
+	if (!area)
+		return NULL;
+
+	addr = (unsigned long)area->addr;
+	if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
+				pages, PAGE_SHIFT) < 0) {
+		vunmap(area->addr);
+		return NULL;
+	}
+
+	/* Only with VM_MAP_PUT_PAGES does the area take over @pages. */
+	if (flags & VM_MAP_PUT_PAGES) {
+		area->pages = pages;
+		area->nr_pages = count;
+	}
+	return area->addr;
+}
+EXPORT_SYMBOL(vmap);
+
+#ifdef CONFIG_VMAP_PFN
+/* Cursor handed to vmap_pfn_apply() through apply_to_page_range(). */
+struct vmap_pfn_data {
+	unsigned long	*pfns;	/* PFN array being mapped */
+	pgprot_t	prot;	/* protection for every PTE */
+	unsigned int	idx;	/* index of the next PFN to install */
+};
+
+/*
+ * Per-PTE callback: install the next PFN from the cursor as a special PTE.
+ * Fails with -EINVAL (and a one-time warning) if pfn_valid() is true for
+ * that PFN.
+ */
+static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
+{
+	struct vmap_pfn_data *data = private;
+
+	if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
+		return -EINVAL;
+	*pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
+	return 0;
+}
+
+/**
+ * vmap_pfn - map an array of PFNs into virtually contiguous space
+ * @pfns: array of PFNs
+ * @count: number of pages to map
+ * @prot: page protection for the mapping
+ *
+ * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
+ * the start address of the mapping.
+ */
+void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
+{
+	struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
+	struct vm_struct *area;
+
+	area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
+			__builtin_return_address(0));
+	if (!area)
+		return NULL;
+	/* Any callback failure aborts the mapping and releases the area. */
+	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+			count * PAGE_SIZE, vmap_pfn_apply, &data)) {
+		free_vm_area(area);
+		return NULL;
+	}
+
+	flush_cache_vmap((unsigned long)area->addr,
+			 (unsigned long)area->addr + count * PAGE_SIZE);
+
+	return area->addr;
+}
+EXPORT_SYMBOL_GPL(vmap_pfn);
+#endif /* CONFIG_VMAP_PFN */
+
+/*
+ * Fill @pages with up to @nr_pages page pointers, allocated in units of
+ * (1 << @order) pages on node @nid (or per mempolicy for NUMA_NO_NODE).
+ * Returns how many entries of @pages were filled; this can be fewer than
+ * @nr_pages.
+ */
+static inline unsigned int
+vm_area_alloc_pages(gfp_t gfp, int nid,
+		unsigned int order, unsigned int nr_pages, struct page **pages)
+{
+	unsigned int nr_allocated = 0;
+	struct page *page;
+	int i;
+
+	/*
+	 * For order-0 pages we make use of bulk allocator, if
+	 * the page array is partly or not at all populated due
+	 * to fails, fallback to a single page allocator that is
+	 * more permissive.
+	 */
+	if (!order) {
+		/* The bulk attempt never uses __GFP_NOFAIL. */
+		gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
+
+		while (nr_allocated < nr_pages) {
+			unsigned int nr, nr_pages_request;
+
+			/*
+			 * A maximum allowed request is hard-coded and is 100
+			 * pages per call. That is done in order to prevent a
+			 * long preemption off scenario in the bulk-allocator
+			 * so the range is [1:100].
+			 */
+			nr_pages_request = min(100U, nr_pages - nr_allocated);
+
+			/* memory allocation should consider mempolicy, we can't
+			 * wrongly use nearest node when nid == NUMA_NO_NODE,
+			 * otherwise memory may be allocated in only one node,
+			 * but mempolicy wants to alloc memory by interleaving.
+			 */
+			if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
+				nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
+							nr_pages_request,
+							pages + nr_allocated);
+
+			else
+				nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
+							nr_pages_request,
+							pages + nr_allocated);
+
+			nr_allocated += nr;
+			cond_resched();
+
+			/*
+			 * If zero or pages were obtained partly,
+			 * fallback to a single page allocator.
+			 */
+			if (nr != nr_pages_request)
+				break;
+		}
+	}
+
+	/* High-order pages or fallback path if "bulk" fails. */
+
+	while (nr_allocated < nr_pages) {
+		/* Give up early if the task has a fatal signal pending. */
+		if (fatal_signal_pending(current))
+			break;
+
+		if (nid == NUMA_NO_NODE)
+			page = alloc_pages(gfp, order);
+		else
+			page = alloc_pages_node(nid, gfp, order);
+		if (unlikely(!page))
+			break;
+		/*
+		 * Higher order allocations must be able to be treated as
+		 * independent small pages by callers (as they can with
+		 * small-page vmallocs). Some drivers do their own refcounting
+		 * on vmalloc_to_page() pages, some use page->mapping,
+		 * page->lru, etc.
+		 */
+		if (order)
+			split_page(page, order);
+
+		/*
+		 * Careful, we allocate and map page-order pages, but
+		 * tracking is done per PAGE_SIZE page so as to keep the
+		 * vm_struct APIs independent of the physical/mapped size.
+		 */
+		for (i = 0; i < (1U << order); i++)
+			pages[nr_allocated + i] = page + i;
+
+		cond_resched();
+		nr_allocated += 1U << order;
+	}
+
+	return nr_allocated;
+}
+
+/*
+ * Allocate the page-pointer array and the backing pages for @area and map
+ * them at @area->addr with @prot, using pages of (1 << @page_shift) bytes.
+ * Returns @area->addr on success.  On failure the area is released and
+ * NULL is returned.
+ */
+static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
+				 pgprot_t prot, unsigned int page_shift,
+				 int node)
+{
+	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
+	bool nofail = gfp_mask & __GFP_NOFAIL;
+	unsigned long addr = (unsigned long)area->addr;
+	unsigned long size = get_vm_area_size(area);
+	unsigned long array_size;
+	unsigned int nr_small_pages = size >> PAGE_SHIFT;
+	unsigned int page_order;
+	unsigned int flags;
+	int ret;
+
+	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
+	gfp_mask |= __GFP_NOWARN;
+	/* Allow highmem unless the caller asked for a DMA zone. */
+	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
+		gfp_mask |= __GFP_HIGHMEM;
+
+	/* Please note that the recursion is strictly bounded. */
+	if (array_size > PAGE_SIZE) {
+		area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
+					area->caller);
+	} else {
+		area->pages = kmalloc_node(array_size, nested_gfp, node);
+	}
+
+	if (!area->pages) {
+		warn_alloc(gfp_mask, NULL,
+			"vmalloc error: size %lu, failed to allocated page array size %lu",
+			nr_small_pages * PAGE_SIZE, array_size);
+		free_vm_area(area);
+		return NULL;
+	}
+
+	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
+	page_order = vm_area_page_order(area);
+
+	area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
+		node, page_order, nr_small_pages, area->pages);
+
+	/* Account whatever was obtained, even if it is a partial result. */
+	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
+	if (gfp_mask & __GFP_ACCOUNT) {
+		int i;
+
+		for (i = 0; i < area->nr_pages; i++)
+			mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
+	}
+
+	/*
+	 * If not enough pages were obtained to accomplish an
+	 * allocation request, free them via __vfree() if any.
+	 */
+	if (area->nr_pages != nr_small_pages) {
+		/* vm_area_alloc_pages() can also fail due to a fatal signal */
+		if (!fatal_signal_pending(current))
+			warn_alloc(gfp_mask, NULL,
+				"vmalloc error: size %lu, page order %u, failed to allocate pages",
+				area->nr_pages * PAGE_SIZE, page_order);
+		goto fail;
+	}
+
+	/*
+	 * page tables allocations ignore external gfp mask, enforce it
+	 * by the scope API
+	 */
+	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+		flags = memalloc_nofs_save();
+	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+		flags = memalloc_noio_save();
+
+	/* With __GFP_NOFAIL, keep retrying the mapping until it succeeds. */
+	do {
+		ret = vmap_pages_range(addr, addr + size, prot, area->pages,
+			page_shift);
+		if (nofail && (ret < 0))
+			schedule_timeout_uninterruptible(1);
+	} while (nofail && (ret < 0));
+
+	/* Same conditions as the save above, so flags is set when used. */
+	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+		memalloc_nofs_restore(flags);
+	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+		memalloc_noio_restore(flags);
+
+	if (ret < 0) {
+		warn_alloc(gfp_mask, NULL,
+			"vmalloc error: size %lu, failed to map pages",
+			area->nr_pages * PAGE_SIZE);
+		goto fail;
+	}
+
+	return area->addr;
+
+fail:
+	__vfree(area->addr);
+	return NULL;
+}
+
+/**
+ * __vmalloc_node_range - allocate virtually contiguous memory
+ * @size:		  allocation size
+ * @align:		  desired alignment
+ * @start:		  vm area range start
+ * @end:		  vm area range end
+ * @gfp_mask:		  flags for the page level allocator
+ * @prot:		  protection mask for the allocated pages
+ * @vm_flags:		  additional vm area flags (e.g. %VM_NO_GUARD)
+ * @node:		  node to use for allocation or NUMA_NO_NODE
+ * @caller:		  caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Please note that the full set of gfp
+ * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
+ * supported.
+ * Zone modifiers are not supported. From the reclaim modifiers
+ * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
+ * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
+ * __GFP_RETRY_MAYFAIL are not supported).
+ *
+ * __GFP_NOWARN can be used to suppress failures messages.
+ *
+ * Map them into contiguous kernel virtual space, using a pagetable
+ * protection of @prot.
+ *
+ * Return: the address of the area or %NULL on failure
+ */
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+			unsigned long start, unsigned long end, gfp_t gfp_mask,
+			pgprot_t prot, unsigned long vm_flags, int node,
+			const void *caller)
+{
+	struct vm_struct *area;
+	void *ret;
+	kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
+	unsigned long real_size = size;
+	unsigned long real_align = align;
+	unsigned int shift = PAGE_SHIFT;
+
+	if (WARN_ON_ONCE(!size))
+		return NULL;
+
+	if ((size >> PAGE_SHIFT) > totalram_pages()) {
+		warn_alloc(gfp_mask, NULL,
+			"vmalloc error: size %lu, exceeds total pages",
+			real_size);
+		return NULL;
+	}
+
+	if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
+		unsigned long size_per_node;
+
+		/*
+		 * Try huge pages. Only try for PAGE_KERNEL allocations,
+		 * others like modules don't yet expect huge pages in
+		 * their allocations due to apply_to_page_range not
+		 * supporting them.
+		 */
+
+		size_per_node = size;
+		if (node == NUMA_NO_NODE)
+			size_per_node /= num_online_nodes();
+		if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
+			shift = PMD_SHIFT;
+		else
+			shift = arch_vmap_pte_supported_shift(size_per_node);
+
+		align = max(real_align, 1UL << shift);
+		size = ALIGN(real_size, 1UL << shift);
+	}
+
+again:
+	area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
+				  VM_UNINITIALIZED | vm_flags, start, end, node,
+				  gfp_mask, caller);
+	if (!area) {
+		bool nofail = gfp_mask & __GFP_NOFAIL;
+		warn_alloc(gfp_mask, NULL,
+			"vmalloc error: size %lu, vm_struct allocation failed%s",
+			real_size, (nofail) ? ". Retrying." : "");
+		if (nofail) {
+			schedule_timeout_uninterruptible(1);
+			goto again;
+		}
+		goto fail;
+	}
+
+	/*
+	 * Prepare arguments for __vmalloc_area_node() and
+	 * kasan_unpoison_vmalloc().
+	 */
+	if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
+		if (kasan_hw_tags_enabled()) {
+			/*
+			 * Modify protection bits to allow tagging.
+			 * This must be done before mapping.
+			 */
+			prot = arch_vmap_pgprot_tagged(prot);
+
+			/*
+			 * Skip page_alloc poisoning and zeroing for physical
+			 * pages backing VM_ALLOC mapping. Memory is instead
+			 * poisoned and zeroed by kasan_unpoison_vmalloc().
+			 */
+			gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO;
+		}
+
+		/* Take note that the mapping is PAGE_KERNEL. */
+		kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
+	}
+
+	/* Allocate physical pages and map them into vmalloc space. */
+	ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
+	if (!ret)
+		goto fail;
+
+	/*
+	 * Mark the pages as accessible, now that they are mapped.
+	 * The condition for setting KASAN_VMALLOC_INIT should complement the
+	 * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
+	 * to make sure that memory is initialized under the same conditions.
+	 * Tag-based KASAN modes only assign tags to normal non-executable
+	 * allocations, see __kasan_unpoison_vmalloc().
+	 */
+	kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
+	if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
+	    (gfp_mask & __GFP_SKIP_ZERO))
+		kasan_flags |= KASAN_VMALLOC_INIT;
+	/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
+	area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
+
+	/*
+	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
+	 * flag. It means that vm_struct is not fully initialized.
+	 * Now, it is fully initialized, so remove this flag here.
+	 */
+	clear_vm_uninitialized_flag(area);
+
+	size = PAGE_ALIGN(size);
+	if (!(vm_flags & VM_DEFER_KMEMLEAK))
+		kmemleak_vmalloc(area, size, gfp_mask);
+
+	return area->addr;
+
+fail:
+	/* A failed huge-page attempt is retried once with base pages. */
+	if (shift > PAGE_SHIFT) {
+		shift = PAGE_SHIFT;
+		align = real_align;
+		size = real_size;
+		goto again;
+	}
+
+	return NULL;
+}
+
+/**
+ * __vmalloc_node - allocate virtually contiguous memory
+ * @size:	    allocation size
+ * @align:	    desired alignment
+ * @gfp_mask:	    flags for the page level allocator
+ * @node:	    node to use for allocation or NUMA_NO_NODE
+ * @caller:	    caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level allocator with
+ * @gfp_mask flags.  Map them into contiguous kernel virtual space.
+ *
+ * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
+ * and __GFP_NOFAIL are not supported
+ *
+ * Any use of gfp flags outside of GFP_KERNEL should be consulted
+ * with mm people.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *__vmalloc_node(unsigned long size, unsigned long align,
+			    gfp_t gfp_mask, int node, const void *caller)
+{
+	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+				gfp_mask, PAGE_KERNEL, 0, node, caller);
+}
+/*
+ * This is only for performance analysis of vmalloc and stress purpose.
+ * It is required by vmalloc test module, therefore do not use it other
+ * than that.
+ */
+#ifdef CONFIG_TEST_VMALLOC_MODULE
+EXPORT_SYMBOL_GPL(__vmalloc_node);
+#endif
+
+/*
+ * __vmalloc - vmalloc() with caller-supplied gfp flags; no node preference
+ * and no alignment beyond the default.
+ */
+void *__vmalloc(unsigned long size, gfp_t gfp_mask)
+{
+	return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
+				__builtin_return_address(0));
+}
+EXPORT_SYMBOL(__vmalloc);
+
+/**
+ * vmalloc - allocate virtually contiguous memory
+ * @size:    allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc(unsigned long size)
+{
+	return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
+				__builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc);
+
+/**
+ * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
+ * @size:      allocation size
+ * @gfp_mask:  flags for the page level allocator
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * If @size is greater than or equal to PMD_SIZE, allow using
+ * huge pages for the memory
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
+{
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+				    gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+				    NUMA_NO_NODE, __builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(vmalloc_huge);
+
+/**
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ * @size:    allocation size
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vzalloc(unsigned long size)
+{
+	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
+				__builtin_return_address(0));
+}
+EXPORT_SYMBOL(vzalloc);
+
+/**
+ * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
+ * @size: allocation size
+ *
+ * The resulting memory area is zeroed so it can be mapped to userspace
+ * without leaking data.
+ *
+ * The area is aligned to SHMLBA and flagged %VM_USERMAP.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_user(unsigned long size)
+{
+	return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
+				    GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
+				    VM_USERMAP, NUMA_NO_NODE,
+				    __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_user);
+
+/**
+ * vmalloc_node - allocate memory on a specific node
+ * @size:	  allocation size
+ * @node:	  numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_node(unsigned long size, int node)
+{
+	return __vmalloc_node(size, 1, GFP_KERNEL, node,
+			__builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_node);
+
+/**
+ * vzalloc_node - allocate memory on a specific node with zero fill
+ * @size:	allocation size
+ * @node:	numa node
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vzalloc_node(unsigned long size, int node)
+{
+	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
+				__builtin_return_address(0));
+}
+EXPORT_SYMBOL(vzalloc_node);
+
+/* gfp flags used by vmalloc_32() and vmalloc_32_user(). */
+#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
+#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
+#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
+#else
+/*
+ * 64b systems should always have either DMA or DMA32 zones. For others
+ * GFP_DMA32 should do the right thing and use the normal zone.
+ */
+#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
+#endif
+
+/**
+ * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
+ * @size:	allocation size
+ *
+ * Allocate enough 32bit PA addressable pages to cover @size from the
+ * page level allocator and map them into contiguous kernel virtual space.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_32(unsigned long size)
+{
+	return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
+			__builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_32);
+
+/**
+ * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
+ * @size:	     allocation size
+ *
+ * The resulting memory area is 32bit addressable and zeroed so it can be
+ * mapped to userspace without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_32_user(unsigned long size)
+{
+	return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
+				    GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
+				    VM_USERMAP, NUMA_NO_NODE,
+				    __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_32_user);
+
+/*
+ * Small helper routine: copy @count bytes starting at @addr into @buf,
+ * one page (or partial page) at a time.  Where no page is mapped, the
+ * corresponding part of @buf is zero-filled.  Returns the number of bytes
+ * handled.
+ */
+
+static int aligned_vread(char *buf, char *addr, unsigned long count)
+{
+	struct page *p;
+	int copied = 0;
+
+	while (count) {
+		unsigned long offset, length;
+
+		/* Never cross a page boundary within one iteration. */
+		offset = offset_in_page(addr);
+		length = PAGE_SIZE - offset;
+		if (length > count)
+			length = count;
+		p = vmalloc_to_page(addr);
+		/*
+		 * To do safe access to this _mapped_ area, we need
+		 * lock. But adding lock here means that we need to add
+		 * overhead of vmalloc()/vfree() calls for this _debug_
+		 * interface, rarely used. Instead of that, we'll use
+		 * kmap() and get small overhead in this access function.
+ */ + if (p) { + /* We can expect USER0 is not used -- see vread() */ + void *map = kmap_atomic(p); + memcpy(buf, map + offset, length); + kunmap_atomic(map); + } else + memset(buf, 0, length); + + addr += length; + buf += length; + copied += length; + count -= length; + } + return copied; +} + +/** + * vread() - read vmalloc area in a safe way. + * @buf: buffer for reading data + * @addr: vm address. + * @count: number of bytes to be read. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from that area to a given buffer. If the given memory range + * of [addr...addr+count) includes some valid address, data is copied to + * proper area of @buf. If there are memory holes, they'll be zero-filled. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. @buf should be kernel's buffer. + * + * Note: In usual ops, vread() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any information, as /proc/kcore. 
+ * + * Return: number of bytes for which addr and buf should be increased + * (same number as @count) or %0 if [addr...addr+count) doesn't + * include any intersection with valid vmalloc area + */ +long vread(char *buf, char *addr, unsigned long count) +{ + struct vmap_area *va; + struct vm_struct *vm; + char *vaddr, *buf_start = buf; + unsigned long buflen = count; + unsigned long n; + + addr = kasan_reset_tag(addr); + + /* Don't allow overflow */ + if ((unsigned long) addr + count < count) + count = -(unsigned long) addr; + + spin_lock(&vmap_area_lock); + va = find_vmap_area_exceed_addr((unsigned long)addr); + if (!va) + goto finished; + + /* no intersects with alive vmap_area */ + if ((unsigned long)addr + count <= va->va_start) + goto finished; + + list_for_each_entry_from(va, &vmap_area_list, list) { + if (!count) + break; + + if (!va->vm) + continue; + + vm = va->vm; + vaddr = (char *) vm->addr; + if (addr >= vaddr + get_vm_area_size(vm)) + continue; + while (addr < vaddr) { + if (count == 0) + goto finished; + *buf = '\0'; + buf++; + addr++; + count--; + } + n = vaddr + get_vm_area_size(vm) - addr; + if (n > count) + n = count; + if (!(vm->flags & VM_IOREMAP)) + aligned_vread(buf, addr, n); + else /* IOREMAP area is treated as memory hole */ + memset(buf, 0, n); + buf += n; + addr += n; + count -= n; + } +finished: + spin_unlock(&vmap_area_lock); + + if (buf == buf_start) + return 0; + /* zero-fill memory holes */ + if (buf != buf_start + buflen) + memset(buf, 0, buflen - (buf - buf_start)); + + return buflen; +} + +/** + * remap_vmalloc_range_partial - map vmalloc pages to userspace + * @vma: vma to cover + * @uaddr: target user address to start at + * @kaddr: virtual address of vmalloc kernel memory + * @pgoff: offset from @kaddr to start at + * @size: size of map area + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that @kaddr is a valid vmalloc'ed area, + * and that it is big enough to cover the range starting at + * @uaddr 
in @vma. Will return failure if that criteria isn't
+ * met.
+ *
+ * Similar to remap_pfn_range() (see mm/memory.c)
+ */
+int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
+				void *kaddr, unsigned long pgoff,
+				unsigned long size)
+{
+	struct vm_struct *area;
+	unsigned long off;
+	unsigned long end_index;
+	/* @pgoff counts pages: turn it into a byte offset, rejecting overflow */
+	if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
+		return -EINVAL;
+
+	size = PAGE_ALIGN(size);
+	/* both the user address and the kernel address must be page aligned */
+	if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
+		return -EINVAL;
+	/* @kaddr has to be known to find_vm_area() */
+	area = find_vm_area(kaddr);
+	if (!area)
+		return -EINVAL;
+	/* only areas flagged VM_USERMAP or VM_DMA_COHERENT may be mapped */
+	if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
+		return -EINVAL;
+	/* the window [off, off + size) must lie inside the area */
+	if (check_add_overflow(size, off, &end_index) ||
+	    end_index > get_vm_area_size(area))
+		return -EINVAL;
+	kaddr += off;
+	/*
+	 * Insert the backing pages into @vma, one page per iteration. The
+	 * body runs before @size is tested, so one page is inserted even
+	 * for a zero @size (callers presumably never pass 0 -- confirm).
+	 */
+	do {
+		struct page *page = vmalloc_to_page(kaddr);
+		int ret;
+
+		ret = vm_insert_page(vma, uaddr, page);
+		if (ret)
+			return ret;
+
+		uaddr += PAGE_SIZE;
+		kaddr += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	} while (size > 0);
+	/* flag the vma VM_DONTEXPAND | VM_DONTDUMP once every page is in place */
+	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+
+	return 0;
+}
+
+/**
+ * remap_vmalloc_range - map vmalloc pages to userspace
+ * @vma: vma to cover (map full range of vma)
+ * @addr: vmalloc memory
+ * @pgoff: number of pages into addr before first page to map
+ *
+ * Returns: 0 for success, -Exxx on failure
+ *
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * that it is big enough to cover the vma. Will return failure if
+ * that criteria isn't met.
+ *
+ * Similar to remap_pfn_range() (see mm/memory.c)
+ */
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+						unsigned long pgoff)
+{
+	return remap_vmalloc_range_partial(vma, vma->vm_start,
+					   addr, pgoff,
+					   vma->vm_end - vma->vm_start);
+}
+EXPORT_SYMBOL(remap_vmalloc_range);
+/*
+ * free_vm_area - unregister @area with remove_vm_area() and kfree() it.
+ * It is a BUG if remove_vm_area() does not hand back @area itself.
+ */
+void free_vm_area(struct vm_struct *area)
+{
+	struct vm_struct *ret;
+	ret = remove_vm_area(area->addr);
+	BUG_ON(ret != area);
+	kfree(area);
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
+
+#ifdef CONFIG_SMP	/* percpu allocator helpers; closed after pcpu_free_vm_areas() */
+static struct vmap_area *node_to_va(struct rb_node *n)
+{
+	return rb_entry_safe(n, struct vmap_area, rb_node);
+}
+
+/**
+ * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
+ * @addr: target address
+ *
+ * The lookup walks the free area tree (free_vmap_area_root).
+ *
+ * Returns: the vmap_area enclosing @addr if there is one. If there is
+ * no such area, the closest area below @addr (the highest one in
+ * reverse order) is returned, i.e. va->va_start < addr &&
+ * va->va_end < addr, or NULL if there are no areas before @addr.
+ */
+static struct vmap_area *
+pvm_find_va_enclose_addr(unsigned long addr)
+{
+	struct vmap_area *va, *tmp;
+	struct rb_node *n;
+
+	n = free_vmap_area_root.rb_node;
+	va = NULL;
+
+	while (n) {
+		tmp = rb_entry(n, struct vmap_area, rb_node);
+		if (tmp->va_start <= addr) {
+			va = tmp;	/* best candidate so far */
+			if (tmp->va_end >= addr)
+				break;	/* @addr lies within tmp */
+
+			n = n->rb_right;
+		} else {
+			n = n->rb_left;
+		}
+	}
+
+	return va;
+}
+
+/**
+ * pvm_determine_end_from_reverse - find the highest aligned address
+ * of free block below VMALLOC_END
+ * @va:
+ * in - the VA we start the search(reverse order);
+ * out - the VA with the highest aligned end address.
+ * @align: alignment for required highest address
+ *
+ * Returns: determined end address within vmap_area
+ */
+static unsigned long
+pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
+{
+	unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+	unsigned long addr;
+	/*
+	 * Walk the free list downwards from *va. For each block take its end
+	 * rounded down to @align (capped at the aligned VMALLOC_END); the
+	 * first block that starts below that address wins.
+	 */
+	if (likely(*va)) {
+		list_for_each_entry_from_reverse((*va),
+				&free_vmap_area_list, list) {
+			addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
+			if ((*va)->va_start < addr)
+				return addr;
+		}
+	}
+	/*
+	 * *va was NULL or no block qualified. pcpu_get_vm_areas() subtracts
+	 * from this value and checks for the resulting underflow.
+	 */
+	return 0;
+}
+
+/**
+ * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
+ * @offsets: array containing offset of each area
+ * @sizes: array containing size of each area
+ * @nr_vms: the number of areas to allocate
+ * @align: alignment, all entries in @offsets and @sizes must be aligned to this
+ *
+ * Returns: kmalloc'd vm_struct pointer array pointing to allocated
+ * vm_structs on success, %NULL on failure
+ *
+ * Percpu allocator wants to use congruent vm areas so that it can
+ * maintain the offsets among percpu areas. This function allocates
+ * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
+ * be scattered pretty far, distance between two areas easily going up
+ * to gigabytes. To avoid interacting with regular vmallocs, these
+ * areas are allocated from top.
+ *
+ * Despite its complicated look, this allocator is rather simple. It
+ * does everything top-down and scans free blocks from the end looking
+ * for matching base. While scanning, if any of the areas do not fit the
+ * base address is pulled down to fit the area. Scanning is repeated till
+ * all the areas fit and then all necessary data structures are inserted
+ * and the result is returned.
+ */
+struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
+				     const size_t *sizes, int nr_vms,
+				     size_t align)
+{
+	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
+	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+	struct vmap_area **vas, *va;
+	struct vm_struct **vms;
+	int area, area2, last_area, term_area;
+	unsigned long base, start, size, end, last_end, orig_start, orig_end;
+	bool purged = false;
+
+	/* verify parameters and allocate data structures */
+	BUG_ON(offset_in_page(align) || !is_power_of_2(align));
+	for (last_area = 0, area = 0; area < nr_vms; area++) {
+		start = offsets[area];
+		end = start + sizes[area];
+
+		/* is everything aligned properly? */
+		BUG_ON(!IS_ALIGNED(offsets[area], align));
+		BUG_ON(!IS_ALIGNED(sizes[area], align));
+
+		/* detect the area with the highest address */
+		if (start > offsets[last_area])
+			last_area = area;
+		/* no two requested areas may overlap */
+		for (area2 = area + 1; area2 < nr_vms; area2++) {
+			unsigned long start2 = offsets[area2];
+			unsigned long end2 = start2 + sizes[area2];
+
+			BUG_ON(start2 < end && start < end2);
+		}
+	}
+	last_end = offsets[last_area] + sizes[last_area];
+	/* the span up to last_end must fit into the aligned vmalloc range */
+	if (vmalloc_end - vmalloc_start < last_end) {
+		WARN_ON(true);
+		return NULL;
+	}
+
+	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
+	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
+	if (!vas || !vms)
+		goto err_free2;
+
+	for (area = 0; area < nr_vms; area++) {
+		vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
+		vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
+		if (!vas[area] || !vms[area])
+			goto err_free;
+	}
+retry:
+	spin_lock(&free_vmap_area_lock);
+
+	/* start scanning - we scan from the top, begin with the last area */
+	area = term_area = last_area;
+	start = offsets[area];
+	end = start + sizes[area];
+
+	va = pvm_find_va_enclose_addr(vmalloc_end);
+	base = pvm_determine_end_from_reverse(&va, align) - end;
+
+	while (true) {
+		/*
+		 * base might have underflowed, add last_end before
+		 * comparing.
+		 */
+		if (base + last_end < vmalloc_start + last_end)
+			goto overflow;
+
+		/*
+		 * Fitting base has not been found.
+		 */
+		if (va == NULL)
+			goto overflow;
+
+		/*
+		 * If required width exceeds current VA block, move
+		 * base downwards and then recheck.
+		 */
+		if (base + end > va->va_end) {
+			base = pvm_determine_end_from_reverse(&va, align) - end;
+			term_area = area;
+			continue;
+		}
+
+		/*
+		 * If this VA does not fit, move base downwards and recheck.
+		 */
+		if (base + start < va->va_start) {
+			va = node_to_va(rb_prev(&va->rb_node));
+			base = pvm_determine_end_from_reverse(&va, align) - end;
+			term_area = area;
+			continue;
+		}
+
+		/*
+		 * This area fits, move on to the previous one. If
+		 * the previous one is the terminal one, we're done.
+		 */
+		area = (area + nr_vms - 1) % nr_vms;
+		if (area == term_area)
+			break;
+
+		start = offsets[area];
+		end = start + sizes[area];
+		va = pvm_find_va_enclose_addr(base + end);
+	}
+
+	/* we've found a fitting base, insert all va's */
+	for (area = 0; area < nr_vms; area++) {
+		int ret;
+
+		start = base + offsets[area];
+		size = sizes[area];
+
+		va = pvm_find_va_enclose_addr(start);
+		if (WARN_ON_ONCE(va == NULL))
+			/* It is a BUG(), but trigger recovery instead. */
+			goto recovery;
+
+		ret = adjust_va_to_fit_type(&free_vmap_area_root,
+					    &free_vmap_area_list,
+					    va, start, size);
+		if (WARN_ON_ONCE(unlikely(ret)))
+			/* It is a BUG(), but trigger recovery instead. */
+			goto recovery;
+
+		/* Allocated area. */
+		va = vas[area];
+		va->va_start = start;
+		va->va_end = start + size;
+	}
+
+	spin_unlock(&free_vmap_area_lock);
+
+	/* populate the kasan shadow space */
+	for (area = 0; area < nr_vms; area++) {
+		if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
+			goto err_free_shadow;
+	}
+
+	/* insert all vm's */
+	spin_lock(&vmap_area_lock);
+	for (area = 0; area < nr_vms; area++) {
+		insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
+
+		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+				 pcpu_get_vm_areas);
+	}
+	spin_unlock(&vmap_area_lock);
+
+	/*
+	 * Mark allocated areas as accessible. Do it now as a best-effort
+	 * approach, as they can be mapped outside of vmalloc code.
+	 * With hardware tag-based KASAN, marking is skipped for
+	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+	 */
+	for (area = 0; area < nr_vms; area++)
+		vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
+				vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
+
+	kfree(vas);
+	return vms;
+
+recovery:
+	/*
+	 * Remove previously allocated areas. There is no
+	 * need to remove these areas from the busy tree,
+	 * because they are inserted only in the final step,
+	 * once pcpu_get_vm_areas() has succeeded.
+	 */
+	while (area--) {
+		orig_start = vas[area]->va_start;
+		orig_end = vas[area]->va_end;
+		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
+				&free_vmap_area_list);
+		if (va)
+			kasan_release_vmalloc(orig_start, orig_end,
+				va->va_start, va->va_end);
+		vas[area] = NULL;
+	}
+
+overflow:
+	spin_unlock(&free_vmap_area_lock);
+	if (!purged) {
+		purge_vmap_area_lazy();
+		purged = true;
+
+		/*
+		 * Before "retry", check if we recover: the recovery path
+		 * above sets the vas[] slots it gave back to NULL, so
+		 * allocate fresh vmap_areas for those slots.
+		 */
+		for (area = 0; area < nr_vms; area++) {
+			if (vas[area])
+				continue;
+
+			vas[area] = kmem_cache_zalloc(
+				vmap_area_cachep, GFP_KERNEL);
+			if (!vas[area])
+				goto err_free;
+		}
+
+		goto retry;
+	}
+
+err_free:
+	for (area = 0; area < nr_vms; area++) {
+		if (vas[area])
+			kmem_cache_free(vmap_area_cachep, vas[area]);
+
+		kfree(vms[area]);
+	}
+err_free2:
+	kfree(vas);
+	kfree(vms);
+	return NULL;
+
+err_free_shadow:
+	spin_lock(&free_vmap_area_lock);
+	/*
+	 * We release all the vmalloc shadows, even the ones for regions that
+	 * hadn't been successfully added. This relies on kasan_release_vmalloc
+	 * being able to tolerate this case.
+	 */
+	for (area = 0; area < nr_vms; area++) {
+		orig_start = vas[area]->va_start;
+		orig_end = vas[area]->va_end;
+		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
+				&free_vmap_area_list);
+		if (va)
+			kasan_release_vmalloc(orig_start, orig_end,
+				va->va_start, va->va_end);
+		vas[area] = NULL;
+		kfree(vms[area]);
+	}
+	spin_unlock(&free_vmap_area_lock);
+	kfree(vas);
+	kfree(vms);
+	return NULL;
+}
+
+/**
+ * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
+ * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
+ * @nr_vms: the number of allocated areas
+ *
+ * Free vm_structs and the array allocated by pcpu_get_vm_areas().
+ */
+void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
+{
+	int i;
+
+	for (i = 0; i < nr_vms; i++)
+		free_vm_area(vms[i]);
+	kfree(vms);
+}
+#endif	/* CONFIG_SMP */
+/*
+ * vmalloc_dump_obj - describe the vmalloc area found for @object.
+ *
+ * Looks up the vmap_area for the page-aligned @object address and prints
+ * its page count, start address and allocation caller with pr_cont().
+ * vmap_area_lock is only tried (spin_trylock()), so the function gives up
+ * instead of spinning when the lock is contended.
+ *
+ * Returns true if a description was printed; false if the lock was busy,
+ * no area was found, or the area has no vm_struct attached.
+ */
+#ifdef CONFIG_PRINTK
+bool vmalloc_dump_obj(void *object)
+{
+	void *objp = (void *)PAGE_ALIGN((unsigned long)object);
+	const void *caller;
+	struct vm_struct *vm;
+	struct vmap_area *va;
+	unsigned long addr;
+	unsigned int nr_pages;
+
+	if (!spin_trylock(&vmap_area_lock))
+		return false;
+	va = __find_vmap_area((unsigned long)objp, &vmap_area_root);
+	if (!va) {
+		spin_unlock(&vmap_area_lock);
+		return false;
+	}
+
+	vm = va->vm;
+	if (!vm) {
+		spin_unlock(&vmap_area_lock);
+		return false;
+	}
+	addr = (unsigned long)vm->addr;	/* copy out what is printed before unlocking */
+	caller = vm->caller;
+	nr_pages = vm->nr_pages;
+	spin_unlock(&vmap_area_lock);
+	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
+		nr_pages, addr, caller);
+	return true;
+}
+#endif
+/*
+ * seq_file iterator behind the "vmallocinfo" proc entry: it walks
+ * vmap_area_list. s_start() takes vmap_purge_lock and then vmap_area_lock;
+ * s_stop() drops them in the opposite order.
+ */
+#ifdef CONFIG_PROC_FS
+static void *s_start(struct seq_file *m, loff_t *pos)
+	__acquires(&vmap_purge_lock)
+	__acquires(&vmap_area_lock)
+{
+	mutex_lock(&vmap_purge_lock);
+	spin_lock(&vmap_area_lock);
+
+	return seq_list_start(&vmap_area_list, *pos);
+}
+
+static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	return seq_list_next(p, &vmap_area_list, pos);
+}
+
+static void s_stop(struct seq_file *m, void *p)
+	__releases(&vmap_area_lock)
+	__releases(&vmap_purge_lock)
+{
+	spin_unlock(&vmap_area_lock);
+	mutex_unlock(&vmap_purge_lock);
+}
+/*
+ * Print " N<node>=<pages>" for every N_HIGH_MEMORY node with a non-zero
+ * page count in @v, using m->private as the per-node counter array.
+ * Nothing is printed without CONFIG_NUMA, without a counter array, or
+ * while @v is still flagged VM_UNINITIALIZED.
+ */
+static void show_numa_info(struct seq_file *m, struct vm_struct *v)
+{
+	if (IS_ENABLED(CONFIG_NUMA)) {
+		unsigned int nr, *counters = m->private;
+		unsigned int step = 1U << vm_area_page_order(v);
+
+		if (!counters)
+			return;
+
+		if (v->flags & VM_UNINITIALIZED)
+			return;
+		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+		smp_rmb();
+
+		memset(counters, 0, nr_node_ids * sizeof(unsigned int));
+
+		for (nr = 0; nr < v->nr_pages; nr += step)
+			counters[page_to_nid(v->pages[nr])] += step;
+		for_each_node_state(nr, N_HIGH_MEMORY)
+			if (counters[nr])
+				seq_printf(m, " N%u=%u", nr, counters[nr]);
+	}
+}
+/* Print one line for each area still queued on purge_vmap_area_list. */
+static void show_purge_info(struct seq_file *m)
+{
+	struct vmap_area *va;
+
+	spin_lock(&purge_vmap_area_lock);
+	list_for_each_entry(va, &purge_vmap_area_list, list) {
+		seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
+			(void *)va->va_start, (void *)va->va_end,
+			va->va_end - va->va_start);
+	}
+	spin_unlock(&purge_vmap_area_lock);
+}
+/*
+ * Emit the line for the vmap_area behind list entry @p: address range,
+ * size, and (when a vm_struct is attached) caller, page count, physical
+ * address, type flags and per-node page counts.
+ */
+static int s_show(struct seq_file *m, void *p)
+{
+	struct vmap_area *va;
+	struct vm_struct *v;
+
+	va = list_entry(p, struct vmap_area, list);
+
+	/*
+	 * s_show can race with remove_vm_area(): a NULL ->vm means that
+	 * the vmap area is being torn down or is a vm_map_ram allocation.
+	 */
+	if (!va->vm) {
+		seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
+			(void *)va->va_start, (void *)va->va_end,
+			va->va_end - va->va_start);
+
+		goto final;
+	}
+
+	v = va->vm;
+
+	seq_printf(m, "0x%pK-0x%pK %7ld",
+		v->addr, v->addr + v->size, v->size);
+
+	if (v->caller)
+		seq_printf(m, " %pS", v->caller);
+
+	if (v->nr_pages)
+		seq_printf(m, " pages=%d", v->nr_pages);
+
+	if (v->phys_addr)
+		seq_printf(m, " phys=%pa", &v->phys_addr);
+
+	if (v->flags & VM_IOREMAP)
+		seq_puts(m, " ioremap");
+
+	if (v->flags & VM_ALLOC)
+		seq_puts(m, " vmalloc");
+
+	if (v->flags & VM_MAP)
+		seq_puts(m, " vmap");
+
+	if (v->flags & VM_USERMAP)
+		seq_puts(m, " user");
+
+	if (v->flags & VM_DMA_COHERENT)
+		seq_puts(m, " dma-coherent");
+
+	if (is_vmalloc_addr(v->pages))
+		seq_puts(m, " vpages");
+
+	show_numa_info(m, v);
+	seq_putc(m, '\n');
+
+	/*
+	 * As a final step, dump "unpurged" areas.
+	 */
+final:
+	if (list_is_last(&va->list, &vmap_area_list))
+		show_purge_info(m);
+
+	return 0;
+}
+/* seq_file operations used for the "vmallocinfo" proc entry */
+static const struct seq_operations vmalloc_op = {
+	.start = s_start,
+	.next = s_next,
+	.stop = s_stop,
+	.show = s_show,
+};
+/*
+ * Create the "vmallocinfo" proc entry (mode 0400). With CONFIG_NUMA a
+ * private buffer of nr_node_ids unsigned ints is requested, which
+ * show_numa_info() reads through m->private.
+ */
+static int __init proc_vmalloc_init(void)
+{
+	if (IS_ENABLED(CONFIG_NUMA))
+		proc_create_seq_private("vmallocinfo", 0400, NULL,
+				&vmalloc_op,
+				nr_node_ids * sizeof(unsigned int), NULL);
+	else
+		proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
+	return 0;
+}
+module_init(proc_vmalloc_init);
+
+#endif
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
new file mode 100644
index 000000000..22c6689d9
--- /dev/null
+++ b/mm/vmpressure.c
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: GPL-2.0-only +/* + * Linux VM pressure + * + * Copyright 2012 Linaro Ltd. + * Anton Vorontsov + * + * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro, + * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The window size (vmpressure_win) is the number of scanned pages before + * we try to analyze scanned/reclaimed ratio. So the window is used as a + * rate-limit tunable for the "low" level notification, and also for + * averaging the ratio for medium/critical levels. Using small window + * sizes can cause lot of false positives, but too big window size will + * delay the notifications. + * + * As the vmscan reclaimer logic works with chunks which are multiple of + * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. + * + * TODO: Make the window size depend on machine size, as we do for vmstat + * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). + */ +static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; + +/* + * These thresholds are used when we account memory pressure through + * scanned/reclaimed ratio. The current values were chosen empirically.
In
+ * essence, they are percents: the higher the value, the more number
+ * unsuccessful reclaims there were.
+ */
+static const unsigned int vmpressure_level_med = 60;
+static const unsigned int vmpressure_level_critical = 95;
+
+/*
+ * When there are too few pages left to scan, vmpressure() may miss the
+ * critical pressure as the number of pages will be less than "window size".
+ * However, in that case the vmscan priority will rise quickly as the
+ * reclaimer will try to scan LRUs more deeply.
+ *
+ * The vmscan logic considers these special priorities:
+ *
+ *  prio == DEF_PRIORITY (12): reclaimer starts with that value
+ *  prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
+ *  prio == 0                : close to OOM, kernel scans every page in an lru
+ *
+ * Any value in this range is acceptable for this tunable (i.e. from 12 to
+ * 0). The current value of vmpressure_level_critical_prio was chosen
+ * empirically, but the number, in essence, means that we consider
+ * critical level when scanning depth is ~10% of the lru size (vmscan
+ * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
+ * eighth): ilog2(100 / 10) evaluates to 3.
+ */
+static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
+/* Map the embedded work item back to the struct vmpressure that contains it. */
+static struct vmpressure *work_to_vmpressure(struct work_struct *work)
+{
+	return container_of(work, struct vmpressure, work);
+}
+/* Return the vmpressure of the parent memcg, or NULL if there is no parent. */
+static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
+{
+	struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr);
+
+	memcg = parent_mem_cgroup(memcg);
+	if (!memcg)
+		return NULL;
+	return memcg_to_vmpressure(memcg);
+}
+/*
+ * Pressure levels in increasing order of severity: vmpressure_event()
+ * compares them with '<', and they index vmpressure_str_levels[].
+ */
+enum vmpressure_levels {
+	VMPRESSURE_LOW = 0,
+	VMPRESSURE_MEDIUM,
+	VMPRESSURE_CRITICAL,
+	VMPRESSURE_NUM_LEVELS,
+};
+/*
+ * Listener modes (indexes into vmpressure_str_modes[]); see
+ * vmpressure_event() for how each one filters notifications:
+ *  - NO_PASSTHROUGH ("default"): skipped once the event has already been
+ *    signalled to a listener further down the hierarchy;
+ *  - HIERARCHY: not filtered by mode;
+ *  - LOCAL: skipped for events propagated up from a descendant.
+ */
+enum vmpressure_modes {
+	VMPRESSURE_NO_PASSTHROUGH = 0,
+	VMPRESSURE_HIERARCHY,
+	VMPRESSURE_LOCAL,
+	VMPRESSURE_NUM_MODES,
+};
+/* Level names accepted in the args string of vmpressure_register_event(). */
+static const char * const vmpressure_str_levels[] = {
+	[VMPRESSURE_LOW] = "low",
+	[VMPRESSURE_MEDIUM] = "medium",
+	[VMPRESSURE_CRITICAL] = "critical",
+};
+/* Mode names accepted in the args string of vmpressure_register_event(). */
+static const char * const vmpressure_str_modes[] = {
+	[VMPRESSURE_NO_PASSTHROUGH] = "default",
+	[VMPRESSURE_HIERARCHY] = "hierarchy",
+	[VMPRESSURE_LOCAL] = "local",
+};
+/* Translate a pressure percentage into a level using the two thresholds. */
+static enum vmpressure_levels vmpressure_level(unsigned long pressure)
+{
+	if (pressure >= vmpressure_level_critical)
+		return VMPRESSURE_CRITICAL;
+	else if (pressure >= vmpressure_level_med)
+		return VMPRESSURE_MEDIUM;
+	return VMPRESSURE_LOW;
+}
+/*
+ * Compute the pressure percentage, 100 * (1 - reclaimed / scanned), and map
+ * it to a level. The arithmetic is done in integers scaled by
+ * scale = scanned + reclaimed; the pressure is 0 when @reclaimed is not
+ * smaller than @scanned.
+ */
+static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
+						    unsigned long reclaimed)
+{
+	unsigned long scale = scanned + reclaimed;
+	unsigned long pressure = 0;
+
+	/*
+	 * reclaimed can be greater than scanned for things such as reclaimed
+	 * slab pages. shrink_node() just adds reclaimed pages without a
+	 * related increment to scanned pages.
+	 */
+	if (reclaimed >= scanned)
+		goto out;
+	/*
+	 * We calculate the ratio (in percents) of how many pages were
+	 * scanned vs. reclaimed in a given time frame (window). Note that
+	 * time is in VM reclaimer's "ticks", i.e. number of pages
+	 * scanned. This makes it possible to set desired reaction time
+	 * and serves as a ratelimit.
+	 */
+	pressure = scale - (reclaimed * scale / scanned);
+	pressure = pressure * 100 / scale;
+
+out:
+	pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure,
+		 scanned, reclaimed);
+
+	return vmpressure_level(pressure);
+}
+/* One registered listener: an eventfd plus its level threshold and mode. */
+struct vmpressure_event {
+	struct eventfd_ctx *efd;
+	enum vmpressure_levels level;
+	enum vmpressure_modes mode;
+	struct list_head node;
+};
+/*
+ * Signal every eventfd registered on @vmpr whose level threshold does not
+ * exceed @level. Local listeners are skipped when @ancestor is true and
+ * default (no-passthrough) listeners are skipped when @signalled is true.
+ * Returns true if at least one eventfd was signalled.
+ */
+static bool vmpressure_event(struct vmpressure *vmpr,
+			     const enum vmpressure_levels level,
+			     bool ancestor, bool signalled)
+{
+	struct vmpressure_event *ev;
+	bool ret = false;
+
+	mutex_lock(&vmpr->events_lock);
+	list_for_each_entry(ev, &vmpr->events, node) {
+		if (ancestor && ev->mode == VMPRESSURE_LOCAL)
+			continue;
+		if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
+			continue;
+		if (level < ev->level)
+			continue;
+		eventfd_signal(ev->efd, 1);
+		ret = true;
+	}
+	mutex_unlock(&vmpr->events_lock);
+
+	return ret;
+}
+/*
+ * Deferred work: consume the accumulated tree counters, compute the level
+ * and deliver it to the listeners of @vmpr and then of each of its parents.
+ */
+static void vmpressure_work_fn(struct work_struct *work)
+{
+	struct vmpressure *vmpr = work_to_vmpressure(work);
+	unsigned long scanned;
+	unsigned long reclaimed;
+	enum vmpressure_levels level;
+	bool ancestor = false;
+	bool signalled = false;
+
+	spin_lock(&vmpr->sr_lock);
+	/*
+	 * Several contexts might be calling vmpressure(), so it is
+	 * possible that the work was rescheduled again before the old
+	 * work context cleared the counters. In that case we will run
+	 * just after the old work returns, but then scanned might be zero
+	 * here. No need for any locks here since we don't care if
+	 * vmpr->reclaimed is in sync.
+	 *
+	 * NOTE(review): the remark about not needing locks looks stale:
+	 * the tree counters are read and reset under sr_lock in this
+	 * function.
+	 */
+	scanned = vmpr->tree_scanned;
+	if (!scanned) {
+		spin_unlock(&vmpr->sr_lock);
+		return;
+	}
+
+	reclaimed = vmpr->tree_reclaimed;
+	vmpr->tree_scanned = 0;
+	vmpr->tree_reclaimed = 0;
+	spin_unlock(&vmpr->sr_lock);
+
+	level = vmpressure_calc_level(scanned, reclaimed);
+
+	do {
+		if (vmpressure_event(vmpr, level, ancestor, signalled))
+			signalled = true;
+		ancestor = true;
+	} while ((vmpr = vmpressure_parent(vmpr)));
+}
+
+/**
+ * vmpressure() - Account memory pressure through scanned/reclaimed ratio
+ * @gfp: reclaimer's gfp mask
+ * @memcg: cgroup memory controller handle
+ * @tree: legacy subtree mode
+ * @scanned: number of pages scanned
+ * @reclaimed: number of pages reclaimed
+ *
+ * This function should be called from the vmscan reclaim path to account
+ * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
+ * pressure index is then further refined and averaged over time.
+ *
+ * If @tree is set, vmpressure is in traditional userspace reporting
+ * mode: @memcg is considered the pressure root and userspace is
+ * notified of the entire subtree's reclaim efficiency.
+ *
+ * If @tree is not set, reclaim efficiency is recorded for @memcg, and
+ * only in-kernel users are notified.
+ *
+ * This function does not return any value.
+ */
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
+		unsigned long scanned, unsigned long reclaimed)
+{
+	struct vmpressure *vmpr;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	/*
+	 * The in-kernel users only care about the reclaim efficiency
+	 * for this @memcg rather than the whole subtree, and there
+	 * isn't and won't be any in-kernel user in a legacy cgroup.
+	 */
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !tree)
+		return;
+
+	vmpr = memcg_to_vmpressure(memcg);
+
+	/*
+	 * Here we only want to account pressure that userland is able to
+	 * help us with. For example, suppose that DMA zone is under
+	 * pressure; if we notify userland about that kind of pressure,
+	 * then it will be mostly a waste as it will trigger unnecessary
+	 * freeing of memory by userland (since userland is more likely to
+	 * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That
+	 * is why we include only movable, highmem and FS/IO pages.
+	 * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so
+	 * we account it too.
+	 */
+	if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS)))
+		return;
+
+	/*
+	 * If we got here with no pages scanned, then that is an indicator
+	 * that reclaimer was unable to find any shrinkable LRUs at the
+	 * current scanning depth. But it does not mean that we should
+	 * report the critical pressure, yet. If the scanning priority
+	 * (scanning depth) goes too high (deep), we will be notified
+	 * through vmpressure_prio(). But so far, keep calm.
+	 */
+	if (!scanned)
+		return;
+	/*
+	 * Tree mode accumulates into the tree counters and defers the
+	 * notification to the work item once a full window was scanned.
+	 */
+	if (tree) {
+		spin_lock(&vmpr->sr_lock);
+		scanned = vmpr->tree_scanned += scanned;
+		vmpr->tree_reclaimed += reclaimed;
+		spin_unlock(&vmpr->sr_lock);
+
+		if (scanned < vmpressure_win)
+			return;
+		schedule_work(&vmpr->work);
+	} else {
+		enum vmpressure_levels level;
+
+		/* For now, no users for root-level efficiency */
+		if (!memcg || mem_cgroup_is_root(memcg))
+			return;
+
+		spin_lock(&vmpr->sr_lock);
+		scanned = vmpr->scanned += scanned;
+		reclaimed = vmpr->reclaimed += reclaimed;
+		if (scanned < vmpressure_win) {
+			spin_unlock(&vmpr->sr_lock);
+			return;
+		}
+		vmpr->scanned = vmpr->reclaimed = 0;
+		spin_unlock(&vmpr->sr_lock);
+
+		level = vmpressure_calc_level(scanned, reclaimed);
+
+		if (level > VMPRESSURE_LOW) {
+			/*
+			 * Let the socket buffer allocator know that
+			 * we are having trouble reclaiming LRU pages.
+			 *
+			 * For hysteresis keep the pressure state
+			 * asserted for a second in which subsequent
+			 * pressure events can occur.
+			 */
+			WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
+		}
+	}
+}
+
+/**
+ * vmpressure_prio() - Account memory pressure through reclaimer priority level
+ * @gfp: reclaimer's gfp mask
+ * @memcg: cgroup memory controller handle
+ * @prio: reclaimer's priority
+ *
+ * This function should be called from the reclaim path every time when
+ * the vmscan's reclaiming priority (scanning depth) changes.
+ *
+ * This function does not return any value.
+ */
+void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
+{
+	/*
+	 * We only use prio for accounting critical level. For more info
+	 * see comment for vmpressure_level_critical_prio variable above.
+	 */
+	if (prio > vmpressure_level_critical_prio)
+		return;
+
+	/*
+	 * OK, the prio is at or below the threshold: update the vmpressure
+	 * information before the shrinker dives into long shrinking of long
+	 * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0
+	 * to the vmpressure() basically means that we signal 'critical'
+	 * level.
+	 */
+	vmpressure(gfp, memcg, true, vmpressure_win, 0);
+}
+/*
+ * Upper bound on the part of the args string that is parsed: the longest
+ * level name, the longest mode name, plus 2 (presumably the ',' separator
+ * and the terminator -- confirm).
+ */
+#define MAX_VMPRESSURE_ARGS_LEN	(strlen("critical") + strlen("hierarchy") + 2)
+
+/**
+ * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
+ * @memcg: memcg that is interested in vmpressure notifications
+ * @eventfd: eventfd context to link notifications with
+ * @args: event arguments (pressure level threshold, optional mode)
+ *
+ * This function associates eventfd context with the vmpressure
+ * infrastructure, so that the notifications will be delivered to the
+ * @eventfd. The @args parameter is a comma-delimited string that denotes a
+ * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
+ * or "critical") and an optional mode (one of vmpressure_str_modes, i.e.
+ * "hierarchy" or "local").
+ *
+ * To be used as memcg event method.
+ *
+ * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could
+ * not be parsed.
+ */
+int vmpressure_register_event(struct mem_cgroup *memcg,
+			      struct eventfd_ctx *eventfd, const char *args)
+{
+	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+	struct vmpressure_event *ev;
+	enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
+	enum vmpressure_levels level;
+	char *spec, *spec_orig;
+	char *token;
+	int ret = 0;
+	/* parse a private, length-bounded copy: strsep() advances @spec */
+	spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL);
+	if (!spec)
+		return -ENOMEM;
+
+	/* Find required level: the first token must name a known level */
+	token = strsep(&spec, ",");
+	ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
+	if (ret < 0)
+		goto out;
+	level = ret;
+
+	/* Find optional mode: without a second token the default mode is kept */
+	token = strsep(&spec, ",");
+	if (token) {
+		ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
+		if (ret < 0)
+			goto out;
+		mode = ret;
+	}
+
+	ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+	if (!ev) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ev->efd = eventfd;
+	ev->level = level;
+	ev->mode = mode;
+
+	mutex_lock(&vmpr->events_lock);
+	list_add(&ev->node, &vmpr->events);
+	mutex_unlock(&vmpr->events_lock);
+	ret = 0;	/* ret still held the last match_string() index */
+out:
+	kfree(spec_orig);
+	return ret;
+}
+
+/**
+ * vmpressure_unregister_event() - Unbind eventfd from vmpressure
+ * @memcg: memcg handle
+ * @eventfd: eventfd context that was used to link vmpressure with the @cg
+ *
+ * This function does internal manipulations to detach the @eventfd from
+ * the vmpressure notifications, and then frees internal resources
+ * associated with the @eventfd (but the @eventfd itself is not freed).
+ *
+ * To be used as memcg event method.
+ */ +void vmpressure_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); + struct vmpressure_event *ev; + + mutex_lock(&vmpr->events_lock); + list_for_each_entry(ev, &vmpr->events, node) { + if (ev->efd != eventfd) + continue; + list_del(&ev->node); + kfree(ev); + break; + } + mutex_unlock(&vmpr->events_lock); +} + +/** + * vmpressure_init() - Initialize vmpressure control structure + * @vmpr: Structure to be initialized + * + * This function should be called on every allocated vmpressure structure + * before any usage. + */ +void vmpressure_init(struct vmpressure *vmpr) +{ + spin_lock_init(&vmpr->sr_lock); + mutex_init(&vmpr->events_lock); + INIT_LIST_HEAD(&vmpr->events); + INIT_WORK(&vmpr->work, vmpressure_work_fn); +} + +/** + * vmpressure_cleanup() - shuts down vmpressure control structure + * @vmpr: Structure to be cleaned up + * + * This function should be called before the structure in which it is + * embedded is cleaned up. + */ +void vmpressure_cleanup(struct vmpressure *vmpr) +{ + /* + * Make sure there is no pending work before eventfd infrastructure + * goes away. + */ + flush_work(&vmpr->work); +} diff --git a/mm/vmscan.c b/mm/vmscan.c new file mode 100644 index 000000000..9f3cfb7ca --- /dev/null +++ b/mm/vmscan.c @@ -0,0 +1,7793 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Swap reorganised 29.12.95, Stephen Tweedie. + * kswapd added: 7.1.96 sct + * Removed kswapd_ctl limits, and swap out as many pages as needed + * to bring the system back to freepages.high: 2.4.97, Rik van Riel. + * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). + * Multiqueue VM started 5.8.00, Rik van Riel. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for buffer_heads_over_limit */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "swap.h" + +#define CREATE_TRACE_POINTS +#include + +struct scan_control { + /* How many pages shrink_list() should reclaim */ + unsigned long nr_to_reclaim; + + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; + + /* + * The memory cgroup that hit its limit and as a result is the + * primary target of this reclaim invocation. + */ + struct mem_cgroup *target_mem_cgroup; + + /* + * Scan pressure balancing between anon and file LRUs + */ + unsigned long anon_cost; + unsigned long file_cost; + + /* Can active folios be deactivated as part of reclaim? */ +#define DEACTIVATE_ANON 1 +#define DEACTIVATE_FILE 2 + unsigned int may_deactivate:2; + unsigned int force_deactivate:1; + unsigned int skipped_deactivate:1; + + /* Writepage batching in laptop mode; RECLAIM_WRITE */ + unsigned int may_writepage:1; + + /* Can mapped folios be reclaimed? */ + unsigned int may_unmap:1; + + /* Can folios be swapped as part of reclaim? */ + unsigned int may_swap:1; + + /* Proactive reclaim invoked by userspace through memory.reclaim */ + unsigned int proactive:1; + + /* + * Cgroup memory below memory.low is protected as long as we + * don't threaten to OOM. 
If any cgroup is reclaimed at + * reduced force or passed over entirely due to its memory.low + * setting (memcg_low_skipped), and nothing is reclaimed as a + * result, then go back for one more cycle that reclaims the protected + * memory (memcg_low_reclaim) to avert OOM. + */ + unsigned int memcg_low_reclaim:1; + unsigned int memcg_low_skipped:1; + + unsigned int hibernation_mode:1; + + /* One of the zones is ready for compaction */ + unsigned int compaction_ready:1; + + /* There is easily reclaimable cold cache in the current node */ + unsigned int cache_trim_mode:1; + + /* The file folios on the current node are dangerously low */ + unsigned int file_is_tiny:1; + + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + +#ifdef CONFIG_LRU_GEN + /* help kswapd make better choices among multiple memcgs */ + unsigned int memcgs_need_aging:1; + unsigned long last_reclaimed; +#endif + + /* Allocation order */ + s8 order; + + /* Scan (total_size >> priority) pages at once */ + s8 priority; + + /* The highest zone to isolate folios for reclaim from */ + s8 reclaim_idx; + + /* This context's GFP mask */ + gfp_t gfp_mask; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; + + struct { + unsigned int dirty; + unsigned int unqueued_dirty; + unsigned int congested; + unsigned int writeback; + unsigned int immediate; + unsigned int file_taken; + unsigned int taken; + } nr; + + /* for recording the reclaimed slab by now */ + struct reclaim_state reclaim_state; +}; + +#ifdef ARCH_HAS_PREFETCHW +#define prefetchw_prev_lru_folio(_folio, _base, _field) \ + do { \ + if ((_folio)->lru.prev != _base) { \ + struct folio *prev; \ + \ + prev = lru_to_folio(&(_folio->lru)); \ + prefetchw(&prev->_field); \ + } \ + } while (0) +#else +#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while 
(0)
+#endif
+
+/*
+ * From 0 .. 200. Higher means more swappy.
+ */
+int vm_swappiness = 60;
+
+/*
+ * Install (or clear, when @rs is NULL) the reclaim_state pointer of @task,
+ * warning once about either kind of redundant transition.
+ */
+static void set_task_reclaim_state(struct task_struct *task,
+				   struct reclaim_state *rs)
+{
+	struct reclaim_state *cur = task->reclaim_state;
+
+	/* Setting a state while one is already installed is an overwrite */
+	WARN_ON_ONCE(rs && cur);
+
+	/* Clearing a state that is already NULL */
+	WARN_ON_ONCE(!rs && !cur);
+
+	task->reclaim_state = rs;
+}
+
+LIST_HEAD(shrinker_list);
+DECLARE_RWSEM(shrinker_rwsem);
+
+#ifdef CONFIG_MEMCG
+static int shrinker_nr_max;
+
+/*
+ * Bytes needed for a bitmap of @nr_items bits. The shrinker_info is
+ * expanded in a batch of BITS_PER_LONG.
+ */
+static inline int shrinker_map_size(int nr_items)
+{
+	int nr_longs = DIV_ROUND_UP(nr_items, BITS_PER_LONG);
+
+	return nr_longs * sizeof(unsigned long);
+}
+
+/*
+ * Bytes needed for the nr_deferred counters of @nr_items shrinkers, with
+ * the count rounded up to a multiple of BITS_PER_LONG.
+ */
+static inline int shrinker_defer_size(int nr_items)
+{
+	int nr_slots = round_up(nr_items, BITS_PER_LONG);
+
+	return nr_slots * sizeof(atomic_long_t);
+}
+
+/*
+ * Fetch the shrinker_info of @memcg on node @nid; the dereference is
+ * annotated as protected by shrinker_rwsem.
+ */
+static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
+						     int nid)
+{
+	struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+
+	return rcu_dereference_protected(pn->shrinker_info,
+					 lockdep_is_held(&shrinker_rwsem));
+}
+
+static int expand_one_shrinker_info(struct mem_cgroup *memcg,
+				    int map_size, int defer_size,
+				    int old_map_size, int old_defer_size)
+{
+	struct shrinker_info *new, *old;
+	struct mem_cgroup_per_node *pn;
+	int nid;
+	int size = map_size + defer_size;
+
+	for_each_node(nid) {
+		pn = memcg->nodeinfo[nid];
+		old = shrinker_info_protected(memcg, nid);
+		/* Not yet online memcg */
+		if (!old)
+			return 0;
+
+		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
+		if (!new)
+			return -ENOMEM;
+
+		new->nr_deferred = (atomic_long_t *)(new + 1);
+		new->map = (void *)new->nr_deferred + defer_size;
+
+		/* map: set all old bits, clear all new bits */
+		memset(new->map, (int)0xff, old_map_size);
+		memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
+		/* nr_deferred: copy old values, clear all new values */
+		memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
+		memset((void *)new->nr_deferred + old_defer_size, 0,
defer_size - old_defer_size);
+
+		rcu_assign_pointer(pn->shrinker_info, new);
+		kvfree_rcu(old, rcu);
+	}
+
+	return 0;
+}
+
+/*
+ * Free the per-node shrinker_info of @memcg and reset each pointer to NULL.
+ */
+void free_shrinker_info(struct mem_cgroup *memcg)
+{
+	int nid;
+
+	for_each_node(nid) {
+		struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+		struct shrinker_info *info;
+
+		info = rcu_dereference_protected(pn->shrinker_info, true);
+		kvfree(info);
+		rcu_assign_pointer(pn->shrinker_info, NULL);
+	}
+}
+
+/*
+ * Allocate a zeroed shrinker_info, sized for shrinker_nr_max shrinkers, for
+ * every node of @memcg. The nr_deferred array sits directly behind the
+ * structure and the bitmap directly behind that.
+ *
+ * Returns 0 on success. On allocation failure free_shrinker_info() is
+ * called on @memcg and -ENOMEM is returned.
+ */
+int alloc_shrinker_info(struct mem_cgroup *memcg)
+{
+	int map_bytes, defer_bytes, payload;
+	int nid, err = 0;
+
+	down_write(&shrinker_rwsem);
+	map_bytes = shrinker_map_size(shrinker_nr_max);
+	defer_bytes = shrinker_defer_size(shrinker_nr_max);
+	payload = map_bytes + defer_bytes;
+
+	for_each_node(nid) {
+		struct shrinker_info *info;
+
+		info = kvzalloc_node(sizeof(*info) + payload, GFP_KERNEL, nid);
+		if (!info) {
+			free_shrinker_info(memcg);
+			err = -ENOMEM;
+			break;
+		}
+		info->nr_deferred = (atomic_long_t *)(info + 1);
+		info->map = (void *)info->nr_deferred + defer_bytes;
+		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
+	}
+	up_write(&shrinker_rwsem);
+
+	return err;
+}
+
+/*
+ * True when @nr_max, rounded up to BITS_PER_LONG, exceeds the equally
+ * rounded current shrinker_nr_max.
+ */
+static inline bool need_expand(int nr_max)
+{
+	int wanted = round_up(nr_max, BITS_PER_LONG);
+	int have = round_up(shrinker_nr_max, BITS_PER_LONG);
+
+	return wanted > have;
+}
+
+static int expand_shrinker_info(int new_id)
+{
+	int ret = 0;
+	int new_nr_max = new_id + 1;
+	int map_size, defer_size = 0;
+	int old_map_size, old_defer_size = 0;
+	struct mem_cgroup *memcg;
+
+	if (!need_expand(new_nr_max))
+		goto out;
+
+	if (!root_mem_cgroup)
+		goto out;
+
+	lockdep_assert_held(&shrinker_rwsem);
+
+	map_size = shrinker_map_size(new_nr_max);
+	defer_size = shrinker_defer_size(new_nr_max);
+	old_map_size = shrinker_map_size(shrinker_nr_max);
+	old_defer_size = shrinker_defer_size(shrinker_nr_max);
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		ret = expand_one_shrinker_info(memcg, map_size, defer_size,
+					       old_map_size, old_defer_size);
+		if (ret) {
+			mem_cgroup_iter_break(NULL, memcg);
goto out;
+		}
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+out:
+	if (!ret)
+		shrinker_nr_max = new_nr_max;
+
+	return ret;
+}
+
+/*
+ * Set bit @shrinker_id in the shrinker bitmap of @memcg on node @nid.
+ * Does nothing for a negative id, a NULL @memcg or the root memcg.
+ */
+void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
+{
+	struct shrinker_info *info;
+
+	if (shrinker_id < 0 || !memcg || mem_cgroup_is_root(memcg))
+		return;
+
+	rcu_read_lock();
+	info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+	/* Pairs with smp mb in shrink_slab() */
+	smp_mb__before_atomic();
+	set_bit(shrinker_id, info->map);
+	rcu_read_unlock();
+}
+
+static DEFINE_IDR(shrinker_idr);
+
+/*
+ * Allocate an id for @shrinker from shrinker_idr and, when the id does not
+ * fit below shrinker_nr_max, expand the shrinker_info first.
+ *
+ * Returns 0 on success, -ENOSYS when the memory controller is disabled and
+ * -ENOMEM when either the id allocation or the expansion fails.
+ */
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+	int err = -ENOMEM;
+	int id;
+
+	if (mem_cgroup_disabled())
+		return -ENOSYS;
+
+	down_write(&shrinker_rwsem);
+	/* This may call shrinker, so it must use down_read_trylock() */
+	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
+	if (id < 0)
+		goto unlock;
+
+	if (id >= shrinker_nr_max && expand_shrinker_info(id)) {
+		/* Give the id back; err is still -ENOMEM */
+		idr_remove(&shrinker_idr, id);
+		goto unlock;
+	}
+
+	shrinker->id = id;
+	err = 0;
+unlock:
+	up_write(&shrinker_rwsem);
+	return err;
+}
+
+/*
+ * Release the id of @shrinker from shrinker_idr. The caller must hold
+ * shrinker_rwsem.
+ */
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+	BUG_ON(shrinker->id < 0);
+
+	lockdep_assert_held(&shrinker_rwsem);
+
+	idr_remove(&shrinker_idr, shrinker->id);
+}
+
+/*
+ * Atomically read and zero the deferred count that @memcg keeps for
+ * @shrinker on node @nid; the previous value is returned.
+ */
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+				   struct mem_cgroup *memcg)
+{
+	struct shrinker_info *info = shrinker_info_protected(memcg, nid);
+	atomic_long_t *slot = &info->nr_deferred[shrinker->id];
+
+	return atomic_long_xchg(slot, 0);
+}
+
+/*
+ * Atomically add @nr to the deferred count that @memcg keeps for @shrinker
+ * on node @nid; the new value is returned.
+ */
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+				  struct mem_cgroup *memcg)
+{
+	struct shrinker_info *info = shrinker_info_protected(memcg, nid);
+	atomic_long_t *slot = &info->nr_deferred[shrinker->id];
+
+	return atomic_long_add_return(nr, slot);
+}
+
+void reparent_shrinker_deferred(struct mem_cgroup *memcg)
+{
+	int i, nid;
+	long nr;
+	struct mem_cgroup *parent;
+	struct shrinker_info *child_info, *parent_info;
+
+ parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* Prevent from concurrent shrinker_info expand */ + down_read(&shrinker_rwsem); + for_each_node(nid) { + child_info = shrinker_info_protected(memcg, nid); + parent_info = shrinker_info_protected(parent, nid); + for (i = 0; i < shrinker_nr_max; i++) { + nr = atomic_long_read(&child_info->nr_deferred[i]); + atomic_long_add(nr, &parent_info->nr_deferred[i]); + } + } + up_read(&shrinker_rwsem); +} + +static bool cgroup_reclaim(struct scan_control *sc) +{ + return sc->target_mem_cgroup; +} + +/** + * writeback_throttling_sane - is the usual dirty throttling mechanism available? + * @sc: scan_control in question + * + * The normal page dirty throttling mechanism in balance_dirty_pages() is + * completely broken with the legacy memcg and direct stalling in + * shrink_folio_list() is used for throttling instead, which lacks all the + * niceties such as fairness, adaptive pausing, bandwidth proportional + * allocation and configurability. + * + * This function tests whether the vmscan currently in progress can assume + * that the normal dirty throttling mechanism is operational. 
+ */ +static bool writeback_throttling_sane(struct scan_control *sc) +{ + if (!cgroup_reclaim(sc)) + return true; +#ifdef CONFIG_CGROUP_WRITEBACK + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return true; +#endif + return false; +} +#else +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + return -ENOSYS; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ +} + +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + +static bool cgroup_reclaim(struct scan_control *sc) +{ + return false; +} + +static bool writeback_throttling_sane(struct scan_control *sc) +{ + return true; +} +#endif + +static long xchg_nr_deferred(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return xchg_nr_deferred_memcg(nid, shrinker, + sc->memcg); + + return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); +} + + +static long add_nr_deferred(long nr, struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return add_nr_deferred_memcg(nr, nid, shrinker, + sc->memcg); + + return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); +} + +static bool can_demote(int nid, struct scan_control *sc) +{ + if (!numa_demotion_enabled) + return false; + if (sc && sc->no_demotion) + return false; + if (next_demotion_node(nid) == NUMA_NO_NODE) + return false; + + return true; +} + +static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, + int nid, + struct scan_control *sc) +{ + if (memcg == NULL) { + /* + * For non-memcg reclaim, is there + * space in any 
swap device? + */ + if (get_nr_swap_pages() > 0) + return true; + } else { + /* Is the memcg below its swap limit? */ + if (mem_cgroup_get_nr_swap_pages(memcg) > 0) + return true; + } + + /* + * The page can not be swapped. + * + * Can it be reclaimed from this node via demotion? + */ + return can_demote(nid, sc); +} + +/* + * This misses isolated folios which are not accounted for to save counters. + * As the data only determines if reclaim or compaction continues, it is + * not expected that isolated folios will be a dominating factor. + */ +unsigned long zone_reclaimable_pages(struct zone *zone) +{ + unsigned long nr; + + nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); + if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL)) + nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); + + return nr; +} + +/** + * lruvec_lru_size - Returns the number of pages on the given LRU list. + * @lruvec: lru vector + * @lru: lru to use + * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) + */ +static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, + int zone_idx) +{ + unsigned long size = 0; + int zid; + + for (zid = 0; zid <= zone_idx; zid++) { + struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; + + if (!managed_zone(zone)) + continue; + + if (!mem_cgroup_disabled()) + size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); + else + size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); + } + return size; +} + +/* + * Add a shrinker callback to be called from the vm. 
+ */
+static int __prealloc_shrinker(struct shrinker *shrinker)
+{
+	unsigned int bytes = sizeof(*shrinker->nr_deferred);
+
+	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+		int rc = prealloc_memcg_shrinker(shrinker);
+
+		/* Anything but -ENOSYS is final, success included */
+		if (rc != -ENOSYS)
+			return rc;
+
+		/* Fall back to a shrinker that is not memcg aware */
+		shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
+	}
+
+	/* One counter, or one per node for NUMA aware shrinkers */
+	if (shrinker->flags & SHRINKER_NUMA_AWARE)
+		bytes *= nr_node_ids;
+
+	shrinker->nr_deferred = kzalloc(bytes, GFP_KERNEL);
+
+	return shrinker->nr_deferred ? 0 : -ENOMEM;
+}
+
+#ifdef CONFIG_SHRINKER_DEBUG
+/*
+ * Format the shrinker name from @fmt and preallocate @shrinker. The name
+ * is released again when preallocation fails.
+ */
+int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+	va_list args;
+	int rc;
+
+	va_start(args, fmt);
+	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, args);
+	va_end(args);
+	if (!shrinker->name)
+		return -ENOMEM;
+
+	rc = __prealloc_shrinker(shrinker);
+	if (!rc)
+		return 0;
+
+	kfree_const(shrinker->name);
+	shrinker->name = NULL;
+
+	return rc;
+}
+#else
+/* Without CONFIG_SHRINKER_DEBUG the name format is not used. */
+int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+	return __prealloc_shrinker(shrinker);
+}
+#endif
+
+/*
+ * Undo prealloc_shrinker(): drop the memcg id of a memcg aware shrinker,
+ * or free the nr_deferred array of any other shrinker.
+ */
+void free_prealloced_shrinker(struct shrinker *shrinker)
+{
+#ifdef CONFIG_SHRINKER_DEBUG
+	kfree_const(shrinker->name);
+	shrinker->name = NULL;
+#endif
+	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+		down_write(&shrinker_rwsem);
+		unregister_memcg_shrinker(shrinker);
+		up_write(&shrinker_rwsem);
+	} else {
+		kfree(shrinker->nr_deferred);
+		shrinker->nr_deferred = NULL;
+	}
+}
+
+/*
+ * Put a preallocated shrinker on shrinker_list, mark it registered and add
+ * its debugfs entry, all under shrinker_rwsem held for writing.
+ */
+void register_shrinker_prepared(struct shrinker *shrinker)
+{
+	down_write(&shrinker_rwsem);
+	list_add_tail(&shrinker->list, &shrinker_list);
+	shrinker->flags |= SHRINKER_REGISTERED;
+	shrinker_debugfs_add(shrinker);
+	up_write(&shrinker_rwsem);
+}
+
+/* Preallocate @shrinker and, if that worked, register it. */
+static int __register_shrinker(struct shrinker *shrinker)
+{
+	int rc = __prealloc_shrinker(shrinker);
+
+	if (!rc)
+		register_shrinker_prepared(shrinker);
+
+	return rc;
+}
+
+#ifdef CONFIG_SHRINKER_DEBUG
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+	va_list ap;
+	int err;
+
+	va_start(ap, fmt);
+	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+	va_end(ap);
+	if (!shrinker->name)
+		return -ENOMEM;
+
+	err = __register_shrinker(shrinker);
+	if (err) {
+		kfree_const(shrinker->name);
+		shrinker->name = NULL;
+	}
+	return err;
+}
+#else
+/* Without CONFIG_SHRINKER_DEBUG the name format is not used. */
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+	return __register_shrinker(shrinker);
+}
+#endif
+EXPORT_SYMBOL(register_shrinker);
+
+/*
+ * Remove a shrinker. Does nothing unless the shrinker is currently
+ * registered.
+ */
+void unregister_shrinker(struct shrinker *shrinker)
+{
+	struct dentry *dentry;
+
+	if (!(shrinker->flags & SHRINKER_REGISTERED))
+		return;
+
+	down_write(&shrinker_rwsem);
+	list_del(&shrinker->list);
+	shrinker->flags &= ~SHRINKER_REGISTERED;
+	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+		unregister_memcg_shrinker(shrinker);
+	dentry = shrinker_debugfs_remove(shrinker);
+	up_write(&shrinker_rwsem);
+
+	/* The debugfs entry is removed after shrinker_rwsem is dropped */
+	debugfs_remove_recursive(dentry);
+
+	kfree(shrinker->nr_deferred);
+	shrinker->nr_deferred = NULL;
+}
+EXPORT_SYMBOL(unregister_shrinker);
+
+/**
+ * synchronize_shrinkers - Wait for all running shrinkers to complete.
+ *
+ * This is equivalent to calling unregister_shrinker() and register_shrinker(),
+ * but atomically and with less overhead. This is useful to guarantee that all
+ * shrinker invocations have seen an update, before freeing memory, similar to
+ * rcu.
+ */
+void synchronize_shrinkers(void)
+{
+	/* Taking and dropping the write side is the whole operation */
+	down_write(&shrinker_rwsem);
+	up_write(&shrinker_rwsem);
+}
+EXPORT_SYMBOL(synchronize_shrinkers);
+
+#define SHRINK_BATCH 128
+
+static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
+				    struct shrinker *shrinker, int priority)
+{
+	unsigned long freed = 0;
+	unsigned long long delta;
+	long total_scan;
+	long freeable;
+	long nr;
+	long new_nr;
+	long batch_size = shrinker->batch ?
shrinker->batch + : SHRINK_BATCH; + long scanned = 0, next_deferred; + + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0 || freeable == SHRINK_EMPTY) + return freeable; + + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + nr = xchg_nr_deferred(shrinker, shrinkctl); + + if (shrinker->seeks) { + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); + } else { + /* + * These objects don't require any IO to create. Trim + * them aggressively under memory pressure to keep + * them from causing refetches in the IO caches. + */ + delta = freeable / 2; + } + + total_scan = nr >> priority; + total_scan += delta; + total_scan = min(total_scan, (2 * freeable)); + + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, + freeable, delta, total_scan, priority); + + /* + * Normally, we should not scan less than batch_size objects in one + * pass to avoid too frequent shrinker calls, but if the slab has less + * than batch_size objects in total and we are really tight on memory, + * we will try to reclaim all available objects, otherwise we can end + * up failing allocations although there are plenty of reclaimable + * objects spread over several slabs with usage less than the + * batch_size. + * + * We detect the "tight on memory" situations by looking at the total + * number of objects we want to scan (total_scan). If it is greater + * than the total number of objects on slab (freeable), we must be + * scanning at high prio and therefore should try to reclaim as much as + * possible. 
+ */ + while (total_scan >= batch_size || + total_scan >= freeable) { + unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); + + shrinkctl->nr_to_scan = nr_to_scan; + shrinkctl->nr_scanned = nr_to_scan; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; + freed += ret; + + count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); + total_scan -= shrinkctl->nr_scanned; + scanned += shrinkctl->nr_scanned; + + cond_resched(); + } + + /* + * The deferred work is increased by any new work (delta) that wasn't + * done, decreased by old deferred work that was done now. + * + * And it is capped to two times of the freeable items. + */ + next_deferred = max_t(long, (nr + delta - scanned), 0); + next_deferred = min(next_deferred, (2 * freeable)); + + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. + */ + new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); + + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); + return freed; +} + +#ifdef CONFIG_MEMCG +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + struct shrinker_info *info; + unsigned long ret, freed = 0; + int i; + + if (!mem_cgroup_online(memcg)) + return 0; + + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + info = shrinker_info_protected(memcg, nid); + if (unlikely(!info)) + goto unlock; + + for_each_set_bit(i, info->map, shrinker_nr_max) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + struct shrinker *shrinker; + + shrinker = idr_find(&shrinker_idr, i); + if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { + if (!shrinker) + clear_bit(i, info->map); + continue; + } + + /* Call non-slab shrinkers even though kmem is disabled */ + if (!memcg_kmem_enabled() && + !(shrinker->flags & SHRINKER_NONSLAB)) + continue; + + ret = 
do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) { + clear_bit(i, info->map); + /* + * After the shrinker reported that it had no objects to + * free, but before we cleared the corresponding bit in + * the memcg shrinker map, a new object might have been + * added. To make sure, we have the bit set in this + * case, we invoke the shrinker one more time and reset + * the bit if it reports that it is not empty anymore. + * The memory barrier here pairs with the barrier in + * set_shrinker_bit(): + * + * list_lru_add() shrink_slab_memcg() + * list_add_tail() clear_bit() + * + * set_bit() do_shrink_slab() + */ + smp_mb__after_atomic(); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + else + set_shrinker_bit(memcg, nid, i); + } + freed += ret; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } + } +unlock: + up_read(&shrinker_rwsem); + return freed; +} +#else /* CONFIG_MEMCG */ +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + return 0; +} +#endif /* CONFIG_MEMCG */ + +/** + * shrink_slab - shrink slab caches + * @gfp_mask: allocation context + * @nid: node whose slab caches to target + * @memcg: memory cgroup whose slab caches to target + * @priority: the reclaim priority + * + * Call the shrink functions to age shrinkable caches. + * + * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, + * unaware shrinkers will receive a node id of 0 instead. + * + * @memcg specifies the memory cgroup to target. Unaware shrinkers + * are called only if it is the root cgroup. + * + * @priority is sc->priority, we take the number of objects and >> by priority + * in order to get the scan target. + * + * Returns the number of reclaimed slab objects. 
+ */ +static unsigned long shrink_slab(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, + int priority) +{ + unsigned long ret, freed = 0; + struct shrinker *shrinker; + + /* + * The root memcg might be allocated even though memcg is disabled + * via "cgroup_disable=memory" boot parameter. This could make + * mem_cgroup_is_root() return false, then just run memcg slab + * shrink, but skip global shrink. This may result in premature + * oom. + */ + if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) + return shrink_slab_memcg(gfp_mask, nid, memcg, priority); + + if (!down_read_trylock(&shrinker_rwsem)) + goto out; + + list_for_each_entry(shrinker, &shrinker_list, list) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + freed += ret; + /* + * Bail out if someone want to register a new shrinker to + * prevent the registration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } + } + + up_read(&shrinker_rwsem); +out: + cond_resched(); + return freed; +} + +static void drop_slab_node(int nid) +{ + unsigned long freed; + int shift = 0; + + do { + struct mem_cgroup *memcg = NULL; + + if (fatal_signal_pending(current)) + return; + + freed = 0; + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + } while ((freed >> shift++) > 1); +} + +void drop_slab(void) +{ + int nid; + + for_each_online_node(nid) + drop_slab_node(nid); +} + +static inline int is_page_cache_freeable(struct folio *folio) +{ + /* + * A freeable page cache folio is referenced only by the caller + * that isolated the folio, the page cache and optional filesystem + * private data at folio->private. 
+ */ + return folio_ref_count(folio) - folio_test_private(folio) == + 1 + folio_nr_pages(folio); +} + +/* + * We detected a synchronous write error writing a folio out. Probably + * -ENOSPC. We need to propagate that into the address_space for a subsequent + * fsync(), msync() or close(). + * + * The tricky part is that after writepage we cannot touch the mapping: nothing + * prevents it from being freed up. But we have a ref on the folio and once + * that folio is locked, the mapping is pinned. + * + * We're allowed to run sleeping folio_lock() here because we know the caller has + * __GFP_FS. + */ +static void handle_write_error(struct address_space *mapping, + struct folio *folio, int error) +{ + folio_lock(folio); + if (folio_mapping(folio) == mapping) + mapping_set_error(mapping, error); + folio_unlock(folio); +} + +static bool skip_throttle_noprogress(pg_data_t *pgdat) +{ + int reclaimable = 0, write_pending = 0; + int i; + + /* + * If kswapd is disabled, reschedule if necessary but do not + * throttle as the system is likely near OOM. + */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + + /* + * If there are a lot of dirty/writeback folios then do not + * throttle as throttling will occur when the folios cycle + * towards the end of the LRU if still under writeback. + */ + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!managed_zone(zone)) + continue; + + reclaimable += zone_reclaimable_pages(zone); + write_pending += zone_page_state_snapshot(zone, + NR_ZONE_WRITE_PENDING); + } + if (2 * write_pending <= reclaimable) + return true; + + return false; +} + +void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) +{ + wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason]; + long timeout, ret; + DEFINE_WAIT(wait); + + /* + * Do not throttle IO workers, kthreads other than kswapd or + * workqueues. They may be required for reclaim to make + * forward progress (e.g. 
journalling workqueues or kthreads). + */ + if (!current_is_kswapd() && + current->flags & (PF_IO_WORKER|PF_KTHREAD)) { + cond_resched(); + return; + } + + /* + * These figures are pulled out of thin air. + * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many + * parallel reclaimers which is a short-lived event so the timeout is + * short. Failing to make progress or waiting on writeback are + * potentially long-lived events so use a longer timeout. This is shaky + * logic as a failure to make progress could be due to anything from + * writeback to a slow device to excessive referenced folios at the tail + * of the inactive LRU. + */ + switch(reason) { + case VMSCAN_THROTTLE_WRITEBACK: + timeout = HZ/10; + + if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) { + WRITE_ONCE(pgdat->nr_reclaim_start, + node_page_state(pgdat, NR_THROTTLED_WRITTEN)); + } + + break; + case VMSCAN_THROTTLE_CONGESTED: + fallthrough; + case VMSCAN_THROTTLE_NOPROGRESS: + if (skip_throttle_noprogress(pgdat)) { + cond_resched(); + return; + } + + timeout = 1; + + break; + case VMSCAN_THROTTLE_ISOLATED: + timeout = HZ/50; + break; + default: + WARN_ON_ONCE(1); + timeout = HZ; + break; + } + + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = schedule_timeout(timeout); + finish_wait(wqh, &wait); + + if (reason == VMSCAN_THROTTLE_WRITEBACK) + atomic_dec(&pgdat->nr_writeback_throttled); + + trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout), + jiffies_to_usecs(timeout - ret), + reason); +} + +/* + * Account for folios written if tasks are throttled waiting on dirty + * folios to clean. If enough folios have been cleaned since throttling + * started then wakeup the throttled tasks. + */ +void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, + int nr_throttled) +{ + unsigned long nr_written; + + node_stat_add_folio(folio, NR_THROTTLED_WRITTEN); + + /* + * This is an inaccurate read as the per-cpu deltas may not + * be synchronised. 
However, given that the system is + * writeback throttled, it is not worth taking the penalty + * of getting an accurate count. At worst, the throttle + * timeout guarantees forward progress. + */ + nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) - + READ_ONCE(pgdat->nr_reclaim_start); + + if (nr_written > SWAP_CLUSTER_MAX * nr_throttled) + wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]); +} + +/* possible outcome of pageout() */ +typedef enum { + /* failed to write folio out, folio is locked */ + PAGE_KEEP, + /* move folio to the active list, folio is locked */ + PAGE_ACTIVATE, + /* folio has been sent to the disk successfully, folio is unlocked */ + PAGE_SUCCESS, + /* folio is clean and locked */ + PAGE_CLEAN, +} pageout_t; + +/* + * pageout is called by shrink_folio_list() for each dirty folio. + * Calls ->writepage(). + */ +static pageout_t pageout(struct folio *folio, struct address_space *mapping, + struct swap_iocb **plug) +{ + /* + * If the folio is dirty, only perform writeback if that write + * will be non-blocking. To prevent this allocation from being + * stalled by pagecache activity. But note that there may be + * stalls if we need to run get_block(). We could test + * PagePrivate for that. + * + * If this process is currently in __generic_file_write_iter() against + * this folio's queue, we can perform writeback even if that + * will block. + * + * If the folio is swapcache, write it back even if that would + * block, for some throttling. This happens by accident, because + * swap_backing_dev_info is bust: it doesn't reflect the + * congestion state of the swapdevs. Easy to fix, if needed. + */ + if (!is_page_cache_freeable(folio)) + return PAGE_KEEP; + if (!mapping) { + /* + * Some data journaling orphaned folios can have + * folio->mapping == NULL while being dirty with clean buffers. 
+ */ + if (folio_test_private(folio)) { + if (try_to_free_buffers(folio)) { + folio_clear_dirty(folio); + pr_info("%s: orphaned folio\n", __func__); + return PAGE_CLEAN; + } + } + return PAGE_KEEP; + } + if (mapping->a_ops->writepage == NULL) + return PAGE_ACTIVATE; + + if (folio_clear_dirty_for_io(folio)) { + int res; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = SWAP_CLUSTER_MAX, + .range_start = 0, + .range_end = LLONG_MAX, + .for_reclaim = 1, + .swap_plug = plug, + }; + + folio_set_reclaim(folio); + res = mapping->a_ops->writepage(&folio->page, &wbc); + if (res < 0) + handle_write_error(mapping, folio, res); + if (res == AOP_WRITEPAGE_ACTIVATE) { + folio_clear_reclaim(folio); + return PAGE_ACTIVATE; + } + + if (!folio_test_writeback(folio)) { + /* synchronous write or broken a_ops? */ + folio_clear_reclaim(folio); + } + trace_mm_vmscan_write_folio(folio); + node_stat_add_folio(folio, NR_VMSCAN_WRITE); + return PAGE_SUCCESS; + } + + return PAGE_CLEAN; +} + +/* + * Same as remove_mapping, but if the folio is removed from the mapping, it + * gets returned with a refcount of 0. + */ +static int __remove_mapping(struct address_space *mapping, struct folio *folio, + bool reclaimed, struct mem_cgroup *target_memcg) +{ + int refcount; + void *shadow = NULL; + + BUG_ON(!folio_test_locked(folio)); + BUG_ON(mapping != folio_mapping(folio)); + + if (!folio_test_swapcache(folio)) + spin_lock(&mapping->host->i_lock); + xa_lock_irq(&mapping->i_pages); + /* + * The non racy check for a busy folio. + * + * Must be careful with the order of the tests. When someone has + * a ref to the folio, it may be possible that they dirty it then + * drop the reference. 
So if the dirty flag is tested before the + * refcount here, then the following race may occur: + * + * get_user_pages(&page); + * [user mapping goes away] + * write_to(page); + * !folio_test_dirty(folio) [good] + * folio_set_dirty(folio); + * folio_put(folio); + * !refcount(folio) [good, discard it] + * + * [oops, our write_to data is lost] + * + * Reversing the order of the tests ensures such a situation cannot + * escape unnoticed. The smp_rmb is needed to ensure the folio->flags + * load is not satisfied before that of folio->_refcount. + * + * Note that if the dirty flag is always set via folio_mark_dirty, + * and thus under the i_pages lock, then this ordering is not required. + */ + refcount = 1 + folio_nr_pages(folio); + if (!folio_ref_freeze(folio, refcount)) + goto cannot_free; + /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ + if (unlikely(folio_test_dirty(folio))) { + folio_ref_unfreeze(folio, refcount); + goto cannot_free; + } + + if (folio_test_swapcache(folio)) { + swp_entry_t swap = folio_swap_entry(folio); + + /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */ + if (reclaimed && !mapping_exiting(mapping)) + shadow = workingset_eviction(folio, target_memcg); + mem_cgroup_swapout(folio, swap); + __delete_from_swap_cache(folio, swap, shadow); + xa_unlock_irq(&mapping->i_pages); + put_swap_folio(folio, swap); + } else { + void (*free_folio)(struct folio *); + + free_folio = mapping->a_ops->free_folio; + /* + * Remember a shadow entry for reclaimed file cache in + * order to detect refaults, thus thrashing, later on. + * + * But don't store shadows in an address space that is + * already exiting. This is not just an optimization, + * inode reclaim needs to empty out the radix tree or + * the nodes are lost. Don't plant shadows behind its + * back. 
+ * + * We also don't store shadows for DAX mappings because the + * only page cache folios found in these are zero pages + * covering holes, and because we don't want to mix DAX + * exceptional entries and shadow exceptional entries in the + * same address_space. + */ + if (reclaimed && folio_is_file_lru(folio) && + !mapping_exiting(mapping) && !dax_mapping(mapping)) + shadow = workingset_eviction(folio, target_memcg); + __filemap_remove_folio(folio, shadow); + xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); + + if (free_folio) + free_folio(folio); + } + + return 1; + +cannot_free: + xa_unlock_irq(&mapping->i_pages); + if (!folio_test_swapcache(folio)) + spin_unlock(&mapping->host->i_lock); + return 0; +} + +/** + * remove_mapping() - Attempt to remove a folio from its mapping. + * @mapping: The address space. + * @folio: The folio to remove. + * + * If the folio is dirty, under writeback or if someone else has a ref + * on it, removal will fail. + * Return: The number of pages removed from the mapping. 0 if the folio + * could not be removed. + * Context: The caller should have a single refcount on the folio and + * hold its lock. + */ +long remove_mapping(struct address_space *mapping, struct folio *folio) +{ + if (__remove_mapping(mapping, folio, false, NULL)) { + /* + * Unfreezing the refcount with 1 effectively + * drops the pagecache ref for us without requiring another + * atomic operation. + */ + folio_ref_unfreeze(folio, 1); + return folio_nr_pages(folio); + } + return 0; +} + +/** + * folio_putback_lru - Put previously isolated folio onto appropriate LRU list. + * @folio: Folio to be returned to an LRU list. + * + * Add previously isolated @folio to appropriate LRU list. + * The folio may still be unevictable for other reasons. + * + * Context: lru_lock must not be held, interrupts must be enabled. 
+ */ +void folio_putback_lru(struct folio *folio) +{ + folio_add_lru(folio); + folio_put(folio); /* drop ref from isolate */ +} + +enum folio_references { + FOLIOREF_RECLAIM, + FOLIOREF_RECLAIM_CLEAN, + FOLIOREF_KEEP, + FOLIOREF_ACTIVATE, +}; + +static enum folio_references folio_check_references(struct folio *folio, + struct scan_control *sc) +{ + int referenced_ptes, referenced_folio; + unsigned long vm_flags; + + referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, + &vm_flags); + referenced_folio = folio_test_clear_referenced(folio); + + /* + * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. + * Let the folio, now marked Mlocked, be moved to the unevictable list. + */ + if (vm_flags & VM_LOCKED) + return FOLIOREF_ACTIVATE; + + /* rmap lock contention: rotate */ + if (referenced_ptes == -1) + return FOLIOREF_KEEP; + + if (referenced_ptes) { + /* + * All mapped folios start out with page table + * references from the instantiating fault, so we need + * to look twice if a mapped file/anon folio is used more + * than once. + * + * Mark it and spare it for another trip around the + * inactive list. Another page table reference will + * lead to its activation. + * + * Note: the mark is set for activated folios as well + * so that recently deactivated but used folios are + * quickly recovered. + */ + folio_set_referenced(folio); + + if (referenced_folio || referenced_ptes > 1) + return FOLIOREF_ACTIVATE; + + /* + * Activate file-backed executable folios after first usage. 
+ */ + if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) + return FOLIOREF_ACTIVATE; + + return FOLIOREF_KEEP; + } + + /* Reclaim if clean, defer dirty folios to writeback */ + if (referenced_folio && folio_is_file_lru(folio)) + return FOLIOREF_RECLAIM_CLEAN; + + return FOLIOREF_RECLAIM; +} + +/* Check if a folio is dirty or under writeback */ +static void folio_check_dirty_writeback(struct folio *folio, + bool *dirty, bool *writeback) +{ + struct address_space *mapping; + + /* + * Anonymous folios are not handled by flushers and must be written + * from reclaim context. Do not stall reclaim based on them. + * MADV_FREE anonymous folios are put into inactive file list too. + * They could be mistakenly treated as file lru. So further anon + * test is needed. + */ + if (!folio_is_file_lru(folio) || + (folio_test_anon(folio) && !folio_test_swapbacked(folio))) { + *dirty = false; + *writeback = false; + return; + } + + /* By default assume that the folio flags are accurate */ + *dirty = folio_test_dirty(folio); + *writeback = folio_test_writeback(folio); + + /* Verify dirty/writeback state if the filesystem supports it */ + if (!folio_test_private(folio)) + return; + + mapping = folio_mapping(folio); + if (mapping && mapping->a_ops->is_dirty_writeback) + mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); +} + +static struct page *alloc_demote_page(struct page *page, unsigned long private) +{ + struct page *target_page; + nodemask_t *allowed_mask; + struct migration_target_control *mtc; + + mtc = (struct migration_target_control *)private; + + allowed_mask = mtc->nmask; + /* + * make sure we allocate from the target node first also trying to + * demote or reclaim pages from the target node via kswapd if we are + * low on free memory on target node. 
If we don't do this and if + * we have free memory on the slower(lower) memtier, we would start + * allocating pages from slower(lower) memory tiers without even forcing + * a demotion of cold pages from the target memtier. This can result + * in the kernel placing hot pages in slower(lower) memory tiers. + */ + mtc->nmask = NULL; + mtc->gfp_mask |= __GFP_THISNODE; + target_page = alloc_migration_target(page, (unsigned long)mtc); + if (target_page) + return target_page; + + mtc->gfp_mask &= ~__GFP_THISNODE; + mtc->nmask = allowed_mask; + + return alloc_migration_target(page, (unsigned long)mtc); +} + +/* + * Take folios on @demote_folios and attempt to demote them to another node. + * Folios which are not demoted are left on @demote_folios. + */ +static unsigned int demote_folio_list(struct list_head *demote_folios, + struct pglist_data *pgdat) +{ + int target_nid = next_demotion_node(pgdat->node_id); + unsigned int nr_succeeded; + nodemask_t allowed_mask; + + struct migration_target_control mtc = { + /* + * Allocate from 'node', or fail quickly and quietly. + * When this happens, 'page' will likely just be discarded + * instead of migrated. 
+ */ + .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN | + __GFP_NOMEMALLOC | GFP_NOWAIT, + .nid = target_nid, + .nmask = &allowed_mask + }; + + if (list_empty(demote_folios)) + return 0; + + if (target_nid == NUMA_NO_NODE) + return 0; + + node_get_allowed_targets(pgdat, &allowed_mask); + + /* Demotion ignores all cpuset and mempolicy settings */ + migrate_pages(demote_folios, alloc_demote_page, NULL, + (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, + &nr_succeeded); + + if (current_is_kswapd()) + __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded); + else + __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); + + return nr_succeeded; +} + +static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) +{ + if (gfp_mask & __GFP_FS) + return true; + if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO)) + return false; + /* + * We can "enter_fs" for swap-cache with only __GFP_IO + * providing this isn't SWP_FS_OPS. + * ->flags can be updated non-atomicially (scan_swap_map_slots), + * but that will never affect SWP_FS_OPS, so the data_race + * is safe. 
+ */ + return !data_race(folio_swap_flags(folio) & SWP_FS_OPS); +} + +/* + * shrink_folio_list() returns the number of reclaimed pages + */ +static unsigned int shrink_folio_list(struct list_head *folio_list, + struct pglist_data *pgdat, struct scan_control *sc, + struct reclaim_stat *stat, bool ignore_references) +{ + LIST_HEAD(ret_folios); + LIST_HEAD(free_folios); + LIST_HEAD(demote_folios); + unsigned int nr_reclaimed = 0; + unsigned int pgactivate = 0; + bool do_demote_pass; + struct swap_iocb *plug = NULL; + + memset(stat, 0, sizeof(*stat)); + cond_resched(); + do_demote_pass = can_demote(pgdat->node_id, sc); + +retry: + while (!list_empty(folio_list)) { + struct address_space *mapping; + struct folio *folio; + enum folio_references references = FOLIOREF_RECLAIM; + bool dirty, writeback; + unsigned int nr_pages; + + cond_resched(); + + folio = lru_to_folio(folio_list); + list_del(&folio->lru); + + if (!folio_trylock(folio)) + goto keep; + + VM_BUG_ON_FOLIO(folio_test_active(folio), folio); + + nr_pages = folio_nr_pages(folio); + + /* Account the number of base pages */ + sc->nr_scanned += nr_pages; + + if (unlikely(!folio_evictable(folio))) + goto activate_locked; + + if (!sc->may_unmap && folio_mapped(folio)) + goto keep_locked; + + /* folio_update_gen() tried to promote this page? */ + if (lru_gen_enabled() && !ignore_references && + folio_mapped(folio) && folio_test_referenced(folio)) + goto keep_locked; + + /* + * The number of dirty pages determines if a node is marked + * reclaim_congested. kswapd will stall and start writing + * folios if the tail of the LRU is all dirty unqueued folios. 
+ */ + folio_check_dirty_writeback(folio, &dirty, &writeback); + if (dirty || writeback) + stat->nr_dirty += nr_pages; + + if (dirty && !writeback) + stat->nr_unqueued_dirty += nr_pages; + + /* + * Treat this folio as congested if folios are cycling + * through the LRU so quickly that the folios marked + * for immediate reclaim are making it to the end of + * the LRU a second time. + */ + if (writeback && folio_test_reclaim(folio)) + stat->nr_congested += nr_pages; + + /* + * If a folio at the tail of the LRU is under writeback, there + * are three cases to consider. + * + * 1) If reclaim is encountering an excessive number + * of folios under writeback and this folio has both + * the writeback and reclaim flags set, then it + * indicates that folios are being queued for I/O but + * are being recycled through the LRU before the I/O + * can complete. Waiting on the folio itself risks an + * indefinite stall if it is impossible to writeback + * the folio due to I/O error or disconnected storage + * so instead note that the LRU is being scanned too + * quickly and the caller can stall after the folio + * list has been processed. + * + * 2) Global or new memcg reclaim encounters a folio that is + * not marked for immediate reclaim, or the caller does not + * have __GFP_FS (or __GFP_IO if it's simply going to swap, + * not to fs). In this case mark the folio for immediate + * reclaim and continue scanning. + * + * Require may_enter_fs() because we would wait on fs, which + * may not have submitted I/O yet. And the loop driver might + * enter reclaim, and deadlock if it waits on a folio for + * which it is needed to do the write (loop masks off + * __GFP_IO|__GFP_FS for this reason); but more thought + * would probably show more reasons. + * + * 3) Legacy memcg encounters a folio that already has the + * reclaim flag set. 
memcg does not have any dirty folio + * throttling so we could easily OOM just because too many + * folios are in writeback and there is nothing else to + * reclaim. Wait for the writeback to complete. + * + * In cases 1) and 2) we activate the folios to get them out of + * the way while we continue scanning for clean folios on the + * inactive list and refilling from the active list. The + * observation here is that waiting for disk writes is more + * expensive than potentially causing reloads down the line. + * Since they're marked for immediate reclaim, they won't put + * memory pressure on the cache working set any longer than it + * takes to write them to disk. + */ + if (folio_test_writeback(folio)) { + /* Case 1 above */ + if (current_is_kswapd() && + folio_test_reclaim(folio) && + test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { + stat->nr_immediate += nr_pages; + goto activate_locked; + + /* Case 2 above */ + } else if (writeback_throttling_sane(sc) || + !folio_test_reclaim(folio) || + !may_enter_fs(folio, sc->gfp_mask)) { + /* + * This is slightly racy - + * folio_end_writeback() might have + * just cleared the reclaim flag, then + * setting the reclaim flag here ends up + * interpreted as the readahead flag - but + * that does not matter enough to care. + * What we do want is for this folio to + * have the reclaim flag set next time + * memcg reclaim reaches the tests above, + * so it will then wait for writeback to + * avoid OOM; and it's also appropriate + * in global reclaim. 
+ */ + folio_set_reclaim(folio); + stat->nr_writeback += nr_pages; + goto activate_locked; + + /* Case 3 above */ + } else { + folio_unlock(folio); + folio_wait_writeback(folio); + /* then go back and try same folio again */ + list_add_tail(&folio->lru, folio_list); + continue; + } + } + + if (!ignore_references) + references = folio_check_references(folio, sc); + + switch (references) { + case FOLIOREF_ACTIVATE: + goto activate_locked; + case FOLIOREF_KEEP: + stat->nr_ref_keep += nr_pages; + goto keep_locked; + case FOLIOREF_RECLAIM: + case FOLIOREF_RECLAIM_CLEAN: + ; /* try to reclaim the folio below */ + } + + /* + * Before reclaiming the folio, try to relocate + * its contents to another node. + */ + if (do_demote_pass && + (thp_migration_supported() || !folio_test_large(folio))) { + list_add(&folio->lru, &demote_folios); + folio_unlock(folio); + continue; + } + + /* + * Anonymous process memory has backing store? + * Try to allocate it some swap space here. + * Lazyfree folio could be freed directly + */ + if (folio_test_anon(folio) && folio_test_swapbacked(folio)) { + if (!folio_test_swapcache(folio)) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; + if (folio_maybe_dma_pinned(folio)) + goto keep_locked; + if (folio_test_large(folio)) { + /* cannot split folio, skip it */ + if (!can_split_folio(folio, NULL)) + goto activate_locked; + /* + * Split folios without a PMD map right + * away. Chances are some or all of the + * tail pages can be freed without IO. 
+ */ + if (!folio_entire_mapcount(folio) && + split_folio_to_list(folio, + folio_list)) + goto activate_locked; + } + if (!add_to_swap(folio)) { + if (!folio_test_large(folio)) + goto activate_locked_split; + /* Fallback to swap normal pages */ + if (split_folio_to_list(folio, + folio_list)) + goto activate_locked; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_vm_event(THP_SWPOUT_FALLBACK); +#endif + if (!add_to_swap(folio)) + goto activate_locked_split; + } + } + } else if (folio_test_swapbacked(folio) && + folio_test_large(folio)) { + /* Split shmem folio */ + if (split_folio_to_list(folio, folio_list)) + goto keep_locked; + } + + /* + * If the folio was split above, the tail pages will make + * their own pass through this function and be accounted + * then. + */ + if ((nr_pages > 1) && !folio_test_large(folio)) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } + + /* + * The folio is mapped into the page tables of one or more + * processes. Try to unmap it here. + */ + if (folio_mapped(folio)) { + enum ttu_flags flags = TTU_BATCH_FLUSH; + bool was_swapbacked = folio_test_swapbacked(folio); + + if (folio_test_pmd_mappable(folio)) + flags |= TTU_SPLIT_HUGE_PMD; + + try_to_unmap(folio, flags); + if (folio_mapped(folio)) { + stat->nr_unmap_fail += nr_pages; + if (!was_swapbacked && + folio_test_swapbacked(folio)) + stat->nr_lazyfree_fail += nr_pages; + goto activate_locked; + } + } + + /* + * Folio is unmapped now so it cannot be newly pinned anymore. + * No point in trying to reclaim folio if it is pinned. + * Furthermore we don't want to reclaim underlying fs metadata + * if the folio is pinned and thus potentially modified by the + * pinning process as that may upset the filesystem. + */ + if (folio_maybe_dma_pinned(folio)) + goto activate_locked; + + mapping = folio_mapping(folio); + if (folio_test_dirty(folio)) { + /* + * Only kswapd can writeback filesystem folios + * to avoid risk of stack overflow. 
But avoid + * injecting inefficient single-folio I/O into + * flusher writeback as much as possible: only + * write folios when we've encountered many + * dirty folios, and when we've already scanned + * the rest of the LRU for clean folios and see + * the same dirty folios again (with the reclaim + * flag set). + */ + if (folio_is_file_lru(folio) && + (!current_is_kswapd() || + !folio_test_reclaim(folio) || + !test_bit(PGDAT_DIRTY, &pgdat->flags))) { + /* + * Immediately reclaim when written back. + * Similar in principle to deactivate_page() + * except we already have the folio isolated + * and know it's dirty + */ + node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE, + nr_pages); + folio_set_reclaim(folio); + + goto activate_locked; + } + + if (references == FOLIOREF_RECLAIM_CLEAN) + goto keep_locked; + if (!may_enter_fs(folio, sc->gfp_mask)) + goto keep_locked; + if (!sc->may_writepage) + goto keep_locked; + + /* + * Folio is dirty. Flush the TLB if a writable entry + * potentially exists to avoid CPU writes after I/O + * starts and then write it out here. + */ + try_to_unmap_flush_dirty(); + switch (pageout(folio, mapping, &plug)) { + case PAGE_KEEP: + goto keep_locked; + case PAGE_ACTIVATE: + goto activate_locked; + case PAGE_SUCCESS: + stat->nr_pageout += nr_pages; + + if (folio_test_writeback(folio)) + goto keep; + if (folio_test_dirty(folio)) + goto keep; + + /* + * A synchronous write - probably a ramdisk. Go + * ahead and try to reclaim the folio. + */ + if (!folio_trylock(folio)) + goto keep; + if (folio_test_dirty(folio) || + folio_test_writeback(folio)) + goto keep_locked; + mapping = folio_mapping(folio); + fallthrough; + case PAGE_CLEAN: + ; /* try to free the folio below */ + } + } + + /* + * If the folio has buffers, try to free the buffer + * mappings associated with this folio. If we succeed + * we try to free the folio as well. + * + * We do this even if the folio is dirty. 
+ * filemap_release_folio() does not perform I/O, but it + * is possible for a folio to have the dirty flag set, + * but it is actually clean (all its buffers are clean). + * This happens if the buffers were written out directly, + * with submit_bh(). ext3 will do this, as well as + * the blockdev mapping. filemap_release_folio() will + * discover that cleanness and will drop the buffers + * and mark the folio clean - it can be freed. + * + * Rarely, folios can have buffers and no ->mapping. + * These are the folios which were not successfully + * invalidated in truncate_cleanup_folio(). We try to + * drop those buffers here and if that worked, and the + * folio is no longer mapped into process address space + * (refcount == 1) it can be freed. Otherwise, leave + * the folio on the LRU so it is swappable. + */ + if (folio_needs_release(folio)) { + if (!filemap_release_folio(folio, sc->gfp_mask)) + goto activate_locked; + if (!mapping && folio_ref_count(folio) == 1) { + folio_unlock(folio); + if (folio_put_testzero(folio)) + goto free_it; + else { + /* + * rare race with speculative reference. + * the speculative reference will free + * this folio shortly, so we may + * increment nr_reclaimed here (and + * leave it off the LRU). + */ + nr_reclaimed += nr_pages; + continue; + } + } + } + + if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { + /* follow __remove_mapping for reference */ + if (!folio_ref_freeze(folio, 1)) + goto keep_locked; + /* + * The folio has only one reference left, which is + * from the isolation. After the caller puts the + * folio back on the lru and drops the reference, the + * folio will be freed anyway. It doesn't matter + * which lru it goes on. So we don't bother checking + * the dirty flag here. 
+ */ + count_vm_events(PGLAZYFREED, nr_pages); + count_memcg_folio_events(folio, PGLAZYFREED, nr_pages); + } else if (!mapping || !__remove_mapping(mapping, folio, true, + sc->target_mem_cgroup)) + goto keep_locked; + + folio_unlock(folio); +free_it: + /* + * Folio may get swapped out as a whole, need to account + * all pages in it. + */ + nr_reclaimed += nr_pages; + + /* + * Is there need to periodically free_folio_list? It would + * appear not as the counts should be low + */ + if (unlikely(folio_test_large(folio))) + destroy_large_folio(folio); + else + list_add(&folio->lru, &free_folios); + continue; + +activate_locked_split: + /* + * The tail pages that are failed to add into swap cache + * reach here. Fixup nr_scanned and nr_pages. + */ + if (nr_pages > 1) { + sc->nr_scanned -= (nr_pages - 1); + nr_pages = 1; + } +activate_locked: + /* Not a candidate for swapping, so reclaim swap space. */ + if (folio_test_swapcache(folio) && + (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) + folio_free_swap(folio); + VM_BUG_ON_FOLIO(folio_test_active(folio), folio); + if (!folio_test_mlocked(folio)) { + int type = folio_is_file_lru(folio); + folio_set_active(folio); + stat->nr_activate[type] += nr_pages; + count_memcg_folio_events(folio, PGACTIVATE, nr_pages); + } +keep_locked: + folio_unlock(folio); +keep: + list_add(&folio->lru, &ret_folios); + VM_BUG_ON_FOLIO(folio_test_lru(folio) || + folio_test_unevictable(folio), folio); + } + /* 'folio_list' is always empty here */ + + /* Migrate folios selected for demotion */ + nr_reclaimed += demote_folio_list(&demote_folios, pgdat); + /* Folios that could not be demoted are still in @demote_folios */ + if (!list_empty(&demote_folios)) { + /* Folios which weren't demoted go back on @folio_list for retry: */ + list_splice_init(&demote_folios, folio_list); + do_demote_pass = false; + goto retry; + } + + pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; + + mem_cgroup_uncharge_list(&free_folios); + 
try_to_unmap_flush(); + free_unref_page_list(&free_folios); + + list_splice(&ret_folios, folio_list); + count_vm_events(PGACTIVATE, pgactivate); + + if (plug) + swap_write_unplug(plug); + return nr_reclaimed; +} + +unsigned int reclaim_clean_pages_from_list(struct zone *zone, + struct list_head *folio_list) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_unmap = 1, + }; + struct reclaim_stat stat; + unsigned int nr_reclaimed; + struct folio *folio, *next; + LIST_HEAD(clean_folios); + unsigned int noreclaim_flag; + + list_for_each_entry_safe(folio, next, folio_list, lru) { + if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && + !folio_test_dirty(folio) && !__folio_test_movable(folio) && + !folio_test_unevictable(folio)) { + folio_clear_active(folio); + list_move(&folio->lru, &clean_folios); + } + } + + /* + * We should be safe here since we are only dealing with file pages and + * we are not kswapd and therefore cannot write dirty file pages. But + * call memalloc_noreclaim_save() anyway, just in case these conditions + * change in the future. + */ + noreclaim_flag = memalloc_noreclaim_save(); + nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, + &stat, true); + memalloc_noreclaim_restore(noreclaim_flag); + + list_splice(&clean_folios, folio_list); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, + -(long)nr_reclaimed); + /* + * Since lazyfree pages are isolated from file LRU from the beginning, + * they will rotate back to anonymous LRU in the end if it failed to + * discard so isolated count will be mismatched. + * Compensate the isolated count for both LRU lists. + */ + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, + stat.nr_lazyfree_fail); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, + -(long)stat.nr_lazyfree_fail); + return nr_reclaimed; +} + +/* + * Update LRU sizes after isolating pages. 
The LRU size updates must + * be complete before mem_cgroup_update_lru_size due to a sanity check. + */ +static __always_inline void update_lru_sizes(struct lruvec *lruvec, + enum lru_list lru, unsigned long *nr_zone_taken) +{ + int zid; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + if (!nr_zone_taken[zid]) + continue; + + update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); + } + +} + +/* + * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. + * + * lruvec->lru_lock is heavily contended. Some of the functions that + * shrink the lists perform better by taking out a batch of pages + * and working on them outside the LRU lock. + * + * For pagecache intensive workloads, this function is the hottest + * spot in the kernel (apart from copy_*_user functions). + * + * Lru_lock must be held before calling this function. + * + * @nr_to_scan: The number of eligible pages to look through on the list. + * @lruvec: The LRU vector to pull pages from. + * @dst: The temp list to put pages on to. + * @nr_scanned: The number of pages that were scanned. + * @sc: The scan_control struct for this reclaim session + * @lru: LRU list id for isolating + * + * returns how many pages were moved onto *@dst. 
+ */ +static unsigned long isolate_lru_folios(unsigned long nr_to_scan, + struct lruvec *lruvec, struct list_head *dst, + unsigned long *nr_scanned, struct scan_control *sc, + enum lru_list lru) +{ + struct list_head *src = &lruvec->lists[lru]; + unsigned long nr_taken = 0; + unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; + unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; + unsigned long skipped = 0; + unsigned long scan, total_scan, nr_pages; + LIST_HEAD(folios_skipped); + + total_scan = 0; + scan = 0; + while (scan < nr_to_scan && !list_empty(src)) { + struct list_head *move_to = src; + struct folio *folio; + + folio = lru_to_folio(src); + prefetchw_prev_lru_folio(folio, src, flags); + + nr_pages = folio_nr_pages(folio); + total_scan += nr_pages; + + if (folio_zonenum(folio) > sc->reclaim_idx) { + nr_skipped[folio_zonenum(folio)] += nr_pages; + move_to = &folios_skipped; + goto move; + } + + /* + * Do not count skipped folios because that makes the function + * return with no isolated folios if the LRU mostly contains + * ineligible folios. This causes the VM to not reclaim any + * folios, triggering a premature OOM. + * Account all pages in a folio. + */ + scan += nr_pages; + + if (!folio_test_lru(folio)) + goto move; + if (!sc->may_unmap && folio_mapped(folio)) + goto move; + + /* + * Be careful not to clear the lru flag until after we're + * sure the folio is not being freed elsewhere -- the + * folio release code relies on it. + */ + if (unlikely(!folio_try_get(folio))) + goto move; + + if (!folio_test_clear_lru(folio)) { + /* Another thread is already isolating this folio */ + folio_put(folio); + goto move; + } + + nr_taken += nr_pages; + nr_zone_taken[folio_zonenum(folio)] += nr_pages; + move_to = dst; +move: + list_move(&folio->lru, move_to); + } + + /* + * Splice any skipped folios to the start of the LRU list. Note that + * this disrupts the LRU order when reclaiming for lower zones but + * we cannot splice to the tail. 
If we did then the SWAP_CLUSTER_MAX + * scanning would soon rescan the same folios to skip and waste lots + * of cpu cycles. + */ + if (!list_empty(&folios_skipped)) { + int zid; + + list_splice(&folios_skipped, src); + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + if (!nr_skipped[zid]) + continue; + + __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); + skipped += nr_skipped[zid]; + } + } + *nr_scanned = total_scan; + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, + total_scan, skipped, nr_taken, + sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru); + update_lru_sizes(lruvec, lru, nr_zone_taken); + return nr_taken; +} + +/** + * folio_isolate_lru() - Try to isolate a folio from its LRU list. + * @folio: Folio to isolate from its LRU list. + * + * Isolate a @folio from an LRU list and adjust the vmstat statistic + * corresponding to whatever LRU list the folio was on. + * + * The folio will have its LRU flag cleared. If it was found on the + * active list, it will have the Active flag set. If it was found on the + * unevictable list, it will have the Unevictable flag set. These flags + * may need to be cleared by the caller before letting the page go. + * + * Context: + * + * (1) Must be called with an elevated refcount on the folio. This is a + * fundamental difference from isolate_lru_folios() (which is called + * without a stable reference). + * (2) The lru_lock must not be held. + * (3) Interrupts must be enabled. + * + * Return: 0 if the folio was removed from an LRU list. + * -EBUSY if the folio was not on an LRU list. 
+ */ +int folio_isolate_lru(struct folio *folio) +{ + int ret = -EBUSY; + + VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio); + + if (folio_test_clear_lru(folio)) { + struct lruvec *lruvec; + + folio_get(folio); + lruvec = folio_lruvec_lock_irq(folio); + lruvec_del_folio(lruvec, folio); + unlock_page_lruvec_irq(lruvec); + ret = 0; + } + + return ret; +} + +/* + * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and + * then get rescheduled. When there are massive number of tasks doing page + * allocation, such sleeping direct reclaimers may keep piling up on each CPU, + * the LRU list will go small and be scanned faster than necessary, leading to + * unnecessary swapping, thrashing and OOM. + */ +static int too_many_isolated(struct pglist_data *pgdat, int file, + struct scan_control *sc) +{ + unsigned long inactive, isolated; + bool too_many; + + if (current_is_kswapd()) + return 0; + + if (!writeback_throttling_sane(sc)) + return 0; + + if (file) { + inactive = node_page_state(pgdat, NR_INACTIVE_FILE); + isolated = node_page_state(pgdat, NR_ISOLATED_FILE); + } else { + inactive = node_page_state(pgdat, NR_INACTIVE_ANON); + isolated = node_page_state(pgdat, NR_ISOLATED_ANON); + } + + /* + * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they + * won't get blocked by normal direct-reclaimers, forming a circular + * deadlock. + */ + if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) + inactive >>= 3; + + too_many = isolated > inactive; + + /* Wake up tasks throttled due to too_many_isolated. */ + if (!too_many) + wake_throttle_isolated(pgdat); + + return too_many; +} + +/* + * move_folios_to_lru() moves folios from private @list to appropriate LRU list. + * On return, @list is reused as a list of folios to be freed by the caller. + * + * Returns the number of pages moved to the given lruvec. 
+ */ +static unsigned int move_folios_to_lru(struct lruvec *lruvec, + struct list_head *list) +{ + int nr_pages, nr_moved = 0; + LIST_HEAD(folios_to_free); + /* + * Runs with lruvec->lru_lock held; the lock is dropped only around + * folio_putback_lru() and destroy_large_folio() below. + */ + while (!list_empty(list)) { + struct folio *folio = lru_to_folio(list); + + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + list_del(&folio->lru); + if (unlikely(!folio_evictable(folio))) { + spin_unlock_irq(&lruvec->lru_lock); + folio_putback_lru(folio); + spin_lock_irq(&lruvec->lru_lock); + continue; + } + + /* + * The folio_set_lru needs to be kept here for list integrity. + * Otherwise: + * #0 move_folios_to_lru #1 release_pages + * if (!folio_put_testzero()) + * if (folio_put_testzero()) + * !lru //skip lru_lock + * folio_set_lru() + * list_add(&folio->lru,) + * list_add(&folio->lru,) + */ + folio_set_lru(folio); + /* The last reference was dropped: free the folio instead of re-adding it. */ + if (unlikely(folio_put_testzero(folio))) { + __folio_clear_lru_flags(folio); + + if (unlikely(folio_test_large(folio))) { + spin_unlock_irq(&lruvec->lru_lock); + destroy_large_folio(folio); + spin_lock_irq(&lruvec->lru_lock); + } else + list_add(&folio->lru, &folios_to_free); + + continue; + } + + /* + * All pages were isolated from the same lruvec (and isolation + * inhibits memcg migration). + */ + VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + lruvec_add_folio(lruvec, folio); + nr_pages = folio_nr_pages(folio); + nr_moved += nr_pages; + if (folio_test_active(folio)) + workingset_age_nonresident(lruvec, nr_pages); + } + + /* + * To save our caller's stack, reuse the input list to hand back the + * folios that are to be freed. + */ + list_splice(&folios_to_free, list); + + return nr_moved; +} + +/* + * If a kernel thread (such as nfsd for loop-back mounts) services a backing + * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case + * we should not throttle. Otherwise it is safe to do so. + * + * Returns nonzero if the current task does not have PF_LOCAL_THROTTLE set. + */ +static int current_may_throttle(void) +{ + return !(current->flags & PF_LOCAL_THROTTLE); +} + +/* + * shrink_inactive_list() is a helper for shrink_node().
It returns the number + * of reclaimed pages + */ +static unsigned long shrink_inactive_list(unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc, + enum lru_list lru) +{ + LIST_HEAD(folio_list); + unsigned long nr_scanned; + unsigned int nr_reclaimed = 0; + unsigned long nr_taken; + struct reclaim_stat stat; + bool file = is_file_lru(lru); + enum vm_event_item item; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + bool stalled = false; + + while (unlikely(too_many_isolated(pgdat, file, sc))) { + if (stalled) + return 0; + + /* + * Wait a bit for the reclaimer, once: if too many folios are + * still isolated afterwards, the check above returns 0. + */ + stalled = true; + reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); + + /* We are about to die and free our memory. Return now. */ + if (fatal_signal_pending(current)) + return SWAP_CLUSTER_MAX; + } + + lru_add_drain(); + + spin_lock_irq(&lruvec->lru_lock); + /* Isolate a batch and account it as isolated and scanned under lru_lock. */ + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, + &nr_scanned, sc, lru); + + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + if (!cgroup_reclaim(sc)) + __count_vm_events(item, nr_scanned); + __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); + __count_vm_events(PGSCAN_ANON + file, nr_scanned); + + spin_unlock_irq(&lruvec->lru_lock); + + if (nr_taken == 0) + return 0; + + nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); + /* Put what is still on folio_list back on the LRU; the list then holds the folios to free. */ + spin_lock_irq(&lruvec->lru_lock); + move_folios_to_lru(lruvec, &folio_list); + + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + item = current_is_kswapd() ?
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + if (!cgroup_reclaim(sc)) + __count_vm_events(item, nr_reclaimed); + __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); + __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); + spin_unlock_irq(&lruvec->lru_lock); + + lru_note_cost(lruvec, file, stat.nr_pageout); + mem_cgroup_uncharge_list(&folio_list); + free_unref_page_list(&folio_list); + + /* + * If dirty folios are scanned that are not queued for IO, it + * implies that flushers are not doing their job. This can + * happen when memory pressure pushes dirty folios to the end of + * the LRU before the dirty limits are breached and the dirty + * data has expired. It can also happen when the proportion of + * dirty folios grows not through writes but through memory + * pressure reclaiming all the clean cache. And in some cases, + * the flushers simply cannot keep up with the allocation + * rate. Nudge the flusher threads in case they are asleep. + */ + if (stat.nr_unqueued_dirty == nr_taken) { + wakeup_flusher_threads(WB_REASON_VMSCAN); + /* + * For cgroupv1 dirty throttling is achieved by waking up + * the kernel flusher here and later waiting on folios + * which are in writeback to finish (see shrink_folio_list()). + * + * Flusher may not be able to issue writeback quickly + * enough for cgroupv1 writeback throttling to work + * on a large system. + */ + if (!writeback_throttling_sane(sc)) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + } + /* Accumulate this batch's reclaim_stat counters and nr_taken into sc->nr. */ + sc->nr.dirty += stat.nr_dirty; + sc->nr.congested += stat.nr_congested; + sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; + sc->nr.writeback += stat.nr_writeback; + sc->nr.immediate += stat.nr_immediate; + sc->nr.taken += nr_taken; + if (file) + sc->nr.file_taken += nr_taken; + + trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, + nr_scanned, nr_reclaimed, &stat, sc->priority, file); + return nr_reclaimed; +} + +/* + * shrink_active_list() moves folios from the active LRU to the inactive LRU.
+ * + * We move them the other way if the folio is referenced by one or more + * processes. + * + * If the folios are mostly unmapped, the processing is fast and it is + * appropriate to hold lru_lock across the whole operation. But if + * the folios are mapped, the processing is slow (folio_referenced()), so + * we should drop lru_lock around each folio. It's impossible to balance + * this, so instead we remove the folios from the LRU while processing them. + * It is safe to rely on the active flag against the non-LRU folios in here + * because nobody will play with that bit on a non-LRU folio. + * + * The downside is that we have to touch folio->_refcount against each folio. + * But we had to alter folio->flags anyway. + */ +static void shrink_active_list(unsigned long nr_to_scan, + struct lruvec *lruvec, + struct scan_control *sc, + enum lru_list lru) +{ + unsigned long nr_taken; + unsigned long nr_scanned; + unsigned long vm_flags; + LIST_HEAD(l_hold); /* The folios which were snipped off the LRU, still to be examined */ + LIST_HEAD(l_active); + LIST_HEAD(l_inactive); + unsigned nr_deactivate, nr_activate; + unsigned nr_rotated = 0; + int file = is_file_lru(lru); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + lru_add_drain(); + + spin_lock_irq(&lruvec->lru_lock); + /* Isolate a batch onto l_hold and account it under lru_lock. */ + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, + &nr_scanned, sc, lru); + + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); + + if (!cgroup_reclaim(sc)) + __count_vm_events(PGREFILL, nr_scanned); + __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + + spin_unlock_irq(&lruvec->lru_lock); + /* Sort the isolated folios onto l_active or l_inactive without holding lru_lock. */ + while (!list_empty(&l_hold)) { + struct folio *folio; + + cond_resched(); + folio = lru_to_folio(&l_hold); + list_del(&folio->lru); + + if (unlikely(!folio_evictable(folio))) { + folio_putback_lru(folio); + continue; + } + /* + * With buffer_heads_over_limit set, opportunistically call + * filemap_release_folio() on folios that need release and can + * be trylocked. + */ + if (unlikely(buffer_heads_over_limit)) { + if (folio_needs_release(folio) && + folio_trylock(folio)) { + filemap_release_folio(folio, 0); + folio_unlock(folio); + } + 
} + + /* Referenced or rmap lock contention: rotate */ + if (folio_referenced(folio, 0, sc->target_mem_cgroup, + &vm_flags) != 0) { + /* + * Identify referenced, file-backed active folios and + * give them one more trip around the active list, so + * that executable code gets a better chance to stay in + * memory under moderate memory pressure. Anon folios + * are not likely to be evicted by use-once streaming + * IO, plus JVM can create lots of anon VM_EXEC folios, + * so we ignore them here. + */ + if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) { + nr_rotated += folio_nr_pages(folio); + list_add(&folio->lru, &l_active); + continue; + } + } + + folio_clear_active(folio); /* we are de-activating */ + folio_set_workingset(folio); + list_add(&folio->lru, &l_inactive); + } + + /* + * Move folios back to the lru list. + */ + spin_lock_irq(&lruvec->lru_lock); + + nr_activate = move_folios_to_lru(lruvec, &l_active); + nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); + /* Collect the folios to be freed from both lists on l_active */ + list_splice(&l_inactive, &l_active); + + __count_vm_events(PGDEACTIVATE, nr_deactivate); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); + + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_uncharge_list(&l_active); + free_unref_page_list(&l_active); + trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, + nr_deactivate, nr_rotated, sc->priority, file); +} +/* + * Reclaim the folios on @folio_list (callers in this file pass folios of a + * single node together with that node's @pgdat) using shrink_folio_list() and + * a local scan_control: GFP_KERNEL, writepage, unmap and swap allowed, + * no_demotion set. Folios left on the list afterwards are put back with + * folio_putback_lru(). Returns the value returned by shrink_folio_list(). + */ +static unsigned int reclaim_folio_list(struct list_head *folio_list, + struct pglist_data *pgdat) +{ + struct reclaim_stat dummy_stat; + unsigned int nr_reclaimed; + struct folio *folio; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + .no_demotion = 1, + }; + + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false); + while (!list_empty(folio_list)) { + folio = lru_to_folio(folio_list); + 
list_del(&folio->lru); + folio_putback_lru(folio); + } + + return nr_reclaimed; +} +/* + * Reclaim the folios on @folio_list. Folios are taken with lru_to_folio() and + * gathered, with their active flag cleared, into runs that share a node id; + * each run is passed to reclaim_folio_list() with that node's pgdat. The work + * is done between memalloc_noreclaim_save() and memalloc_noreclaim_restore(). + * Returns the sum of the reclaim_folio_list() results (0 for an empty list). + */ +unsigned long reclaim_pages(struct list_head *folio_list) +{ + int nid; + unsigned int nr_reclaimed = 0; + LIST_HEAD(node_folio_list); + unsigned int noreclaim_flag; + + if (list_empty(folio_list)) + return nr_reclaimed; + + noreclaim_flag = memalloc_noreclaim_save(); + + nid = folio_nid(lru_to_folio(folio_list)); + do { + struct folio *folio = lru_to_folio(folio_list); + + if (nid == folio_nid(folio)) { + folio_clear_active(folio); + list_move(&folio->lru, &node_folio_list); + continue; + } + /* Node changed: reclaim the run gathered so far, then start a new one. */ + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); + nid = folio_nid(lru_to_folio(folio_list)); + } while (!list_empty(folio_list)); + /* Reclaim the final run. */ + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); + + memalloc_noreclaim_restore(noreclaim_flag); + + return nr_reclaimed; +} +/* + * Shrink one LRU list. An active list is handed to shrink_active_list() only if + * sc->may_deactivate has the bit for its type (otherwise sc->skipped_deactivate + * is set), and 0 is returned either way. An inactive list is handed to + * shrink_inactive_list(), whose result is returned. + */ +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc) +{ + if (is_active_lru(lru)) { + if (sc->may_deactivate & (1 << is_file_lru(lru))) + shrink_active_list(nr_to_scan, lruvec, sc, lru); + else + sc->skipped_deactivate = 1; + return 0; + } + + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); +} + +/* + * The inactive anon list should be small enough that the VM never has + * to do too much work. + * + * The inactive file list should be small enough to leave most memory + * to the established workingset on the scan-resistant active list, + * but large enough to avoid thrashing the aggregate readahead window. + * + * Both inactive lists should also be large enough that each inactive + * folio has a chance to be referenced again before it is reclaimed. + * + * If that fails and refaulting is observed, the inactive list grows. + * + * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios + * on this LRU, maintained by the pageout code.
An inactive_ratio + * of 3 means 3:1 or 25% of the folios are kept on the inactive list. + * + * total target max + * memory ratio inactive + * ------------------------------------- + * 10MB 1 5MB + * 100MB 1 50MB + * 1GB 3 250MB + * 10GB 10 0.9GB + * 100GB 31 3GB + * 1TB 101 10GB + * 10TB 320 32GB + */ +static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) +{ + enum lru_list active_lru = inactive_lru + LRU_ACTIVE; + unsigned long inactive, active; + unsigned long inactive_ratio; + unsigned long gb; + + inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); + active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); + /* + * gb is the combined size shifted down by (30 - PAGE_SHIFT), i.e. in + * whole gigabytes if the counters are in pages. Below 1GB the ratio is 1. + */ + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); + else + inactive_ratio = 1; + /* Low: the active list is more than inactive_ratio times the inactive list. */ + return inactive * inactive_ratio < active; +} +/* Scan policies selected by get_scan_count(); see the switch statement there. */ +enum scan_balance { + SCAN_EQUAL, + SCAN_FRACT, + SCAN_ANON, + SCAN_FILE, +}; +/* + * Compute reclaim heuristics for node @pgdat and store them in @sc: + * anon_cost/file_cost, may_deactivate, cache_trim_mode and, for non-cgroup + * reclaim, file_is_tiny. Does nothing if lru_gen_enabled(). + */ +static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) +{ + unsigned long file; + struct lruvec *target_lruvec; + + if (lru_gen_enabled()) + return; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + + /* + * Flush the memory cgroup stats, so that we read accurate per-memcg + * lruvec stats for heuristics. + */ + mem_cgroup_flush_stats(); + + /* + * Snapshot the anon and file reclaim costs under lru_lock; + * get_scan_count() uses them to determine the scan balance between + * anon and file LRUs. + */ + spin_lock_irq(&target_lruvec->lru_lock); + sc->anon_cost = target_lruvec->anon_cost; + sc->file_cost = target_lruvec->file_cost; + spin_unlock_irq(&target_lruvec->lru_lock); + + /* + * Target desirable inactive:active list ratios for the anon + * and file LRU lists. + */ + if (!sc->force_deactivate) { + unsigned long refaults; + + /* + * When refaults are being observed, it means a new + * workingset is being established. Deactivate to get + * rid of any stale active pages quickly.
+ */ + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_ANON); + if (refaults != target_lruvec->refaults[WORKINGSET_ANON] || + inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) + sc->may_deactivate |= DEACTIVATE_ANON; + else + sc->may_deactivate &= ~DEACTIVATE_ANON; + /* Same decision for the file LRU. */ + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_FILE); + if (refaults != target_lruvec->refaults[WORKINGSET_FILE] || + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; + else + sc->may_deactivate &= ~DEACTIVATE_FILE; + } else + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; + + /* + * If we have plenty of inactive file pages that aren't + * thrashing, try to reclaim those first before touching + * anonymous pages. + */ + file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + sc->cache_trim_mode = 1; + else + sc->cache_trim_mode = 0; + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ + if (!cgroup_reclaim(sc)) { + unsigned long total_high_wmark = 0; + unsigned long free, anon; + int z; + + free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + file = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + /* Sum the high watermarks of this node's managed zones. */ + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + /* + * Consider anon: if that's low too, this isn't a + * runaway file reclaim problem, but rather just + * extreme pressure.
Reclaim as per usual then. + */ + anon = node_page_state(pgdat, NR_INACTIVE_ANON); + /* + * file_is_tiny: file plus free pages do not exceed the summed + * high watermarks, anon deactivation is not wanted, and there + * is inactive anon left to scan at this priority. + */ + sc->file_is_tiny = + file + free <= total_high_wmark && + !(sc->may_deactivate & DEACTIVATE_ANON) && + anon >> sc->priority; + } +} + +/* + * Determine how aggressively the anon and file LRU lists should be + * scanned. + * + * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan + * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan + * + * @nr is an output array indexed by enum lru_list; one entry is written for + * each evictable LRU. + */ +static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + unsigned long *nr) +{ + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + unsigned long anon_cost, file_cost, total_cost; + int swappiness = mem_cgroup_swappiness(memcg); + u64 fraction[ANON_AND_FILE]; + u64 denominator = 0; /* gcc: presumably initialized only to avoid a maybe-uninitialized warning; it is used on the SCAN_FRACT path only */ + enum scan_balance scan_balance; + unsigned long ap, fp; + enum lru_list lru; + + /* If we have no swap space, do not bother scanning anon folios. */ + if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Global reclaim will swap to prevent OOM even with no + * swappiness, but memcg users want to use this knob to + * disable swapping for individual groups completely when + * using the memory controller's swap limit feature would be + * too expensive. + */ + if (cgroup_reclaim(sc) && !swappiness) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Do not apply any pressure balancing cleverness when the + * system is close to OOM, scan both anon and file equally + * (unless the swappiness setting disagrees with swapping). + */ + if (!sc->priority && swappiness) { + scan_balance = SCAN_EQUAL; + goto out; + } + + /* + * If the system is almost out of file pages, force-scan anon.
+ */ + if (sc->file_is_tiny) { + scan_balance = SCAN_ANON; + goto out; + } + + /* + * If there is enough inactive page cache, we do not reclaim + * anything from the anonymous working set right now. + */ + if (sc->cache_trim_mode) { + scan_balance = SCAN_FILE; + goto out; + } + + scan_balance = SCAN_FRACT; + /* + * Calculate the pressure balance between anon and file pages. + * + * The amount of pressure we put on each LRU is inversely + * proportional to the cost of reclaiming each list, as + * determined by the share of pages that are refaulting, times + * the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). + * + * Although we limit that influence to ensure no list gets + * left behind completely: at least a third of the pressure is + * applied, before swappiness. + * + * With swappiness at 100, anon and file have equal IO cost. + */ + total_cost = sc->anon_cost + sc->file_cost; + anon_cost = total_cost + sc->anon_cost; + file_cost = total_cost + sc->file_cost; + total_cost = anon_cost + file_cost; + /* The "+ 1" terms presumably guard against dividing by a zero cost. */ + ap = swappiness * (total_cost + 1); + ap /= anon_cost + 1; + + fp = (200 - swappiness) * (total_cost + 1); + fp /= file_cost + 1; + /* fraction[] is indexed by is_file_lru(): 0 is anon, 1 is file. */ + fraction[0] = ap; + fraction[1] = fp; + denominator = ap + fp; +out: + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long lruvec_size; + unsigned long low, min; + unsigned long scan; + + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + mem_cgroup_protection(sc->target_mem_cgroup, memcg, + &min, &low); + + if (min || low) { + /* + * Scale a cgroup's reclaim pressure by proportioning + * its current usage to its memory.low or memory.min + * setting. + * + * This is important, as otherwise scanning aggression + * becomes extremely binary -- from nothing as we + * approach the memory protection threshold, to totally + * nominal as we exceed it. This results in requiring + * setting extremely liberal protection thresholds.
It + * also means we simply get no protection at all if we + * set it too low, which is not ideal. + * + * If there is any protection in place, we reduce scan + * pressure by how much of the total memory used is + * within protection thresholds. + * + * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * memory they are using, reducing the scan pressure + * again by how much of the total memory used is under + * hard protection. + */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + unsigned long protection; + + /* memory.low scaling, make sure we retry before OOM */ + if (!sc->memcg_low_reclaim && low > min) { + protection = low; + sc->memcg_low_skipped = 1; + } else { + protection = min; + } + + /* Avoid TOCTOU with earlier protection check */ + cgroup_size = max(cgroup_size, protection); + /* Scan only the share of the list that protection does not cover. */ + scan = lruvec_size - lruvec_size * protection / + (cgroup_size + 1); + + /* + * Minimally target SWAP_CLUSTER_MAX pages to keep + * reclaim moving forwards, avoiding decrementing + * sc->priority further than desirable. + */ + scan = max(scan, SWAP_CLUSTER_MAX); + } else { + scan = lruvec_size; + } + /* A larger sc->priority value scans a smaller share of the list. */ + scan >>= sc->priority; + + /* + * If the cgroup's already been deleted, make sure to + * scrape out the remaining cache. + */ + if (!scan && !mem_cgroup_online(memcg)) + scan = min(lruvec_size, SWAP_CLUSTER_MAX); + + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency.
+ * Make sure we don't miss the last page on + * the offlined memory cgroups because of a + * round-off error. + */ + scan = mem_cgroup_online(memcg) ? + div64_u64(scan * fraction[file], denominator) : + DIV64_U64_ROUND_UP(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively: zero the target of the other type */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain: every scan_balance value is handled above */ + BUG(); + } + + nr[lru] = scan; + } +} + +/* + * Anonymous LRU management is a waste if there is + * ultimately no way to reclaim the memory. + * + * Returns true if total_swap_pages is nonzero or can_demote() allows + * demotion from @pgdat's node. + */ +static bool can_age_anon_pages(struct pglist_data *pgdat, + struct scan_control *sc) +{ + /* Aging the anon LRU is valuable if swap is present: */ + if (total_swap_pages > 0) + return true; + + /* Also valuable if anon pages can be demoted: */ + return can_demote(pgdat->node_id, sc); +} + +#ifdef CONFIG_LRU_GEN + +#ifdef CONFIG_LRU_GEN_ENABLED +DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); +#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) +#else +DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); +#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) +#endif + +/****************************************************************************** + * shorthand helpers + ******************************************************************************/ + +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) + /* Declares a local max_seq holding a READ_ONCE() snapshot of lrugen.max_seq. */ +#define DEFINE_MAX_SEQ(lruvec) \ + unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) + /* Declares a local min_seq[] holding READ_ONCE() snapshots for the anon and file types. */ +#define DEFINE_MIN_SEQ(lruvec) \ + unsigned long min_seq[ANON_AND_FILE] = { \ + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ + } + /* Iterates over every (gen, type, zone) combination; gen is the outermost loop. */ +#define for_each_gen_type_zone(gen, type, zone) \ + for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ + for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ + for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) + +static struct lruvec
*get_lruvec(struct mem_cgroup *memcg, int nid) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + /* With a memcg, use its per-node lruvec; otherwise fall back to the node's own lruvec. */ +#ifdef CONFIG_MEMCG + if (memcg) { + struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; + + /* lruvec->pgdat may still be unset here; see hotadd_new_pgdat() */ + if (!lruvec->pgdat) + lruvec->pgdat = pgdat; + + return lruvec; + } +#endif + VM_WARN_ON_ONCE(!mem_cgroup_disabled()); + /* NODE_DATA(nid) may be NULL, in which case there is no lruvec to return. */ + return pgdat ? &pgdat->__lruvec : NULL; +} +/* + * Swappiness to use for @lruvec: 0 when can_demote() rejects its node and + * mem_cgroup_get_nr_swap_pages() is below MIN_LRU_BATCH, otherwise the + * memcg's swappiness. + */ +static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) +{ + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + if (!can_demote(pgdat->node_id, sc) && + mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) + return 0; + + return mem_cgroup_swappiness(memcg); +} +/* Number of generations of @type, i.e. max_seq - min_seq[type] + 1. */ +static int get_nr_gens(struct lruvec *lruvec, int type) +{ + return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; +} + +static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) +{ + /* + * MIN_NR_GENS <= file generations <= anon generations <= MAX_NR_GENS; + * see the comment on lru_gen_struct. + */ + return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && + get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && + get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; +} + +/****************************************************************************** + * mm_struct list + ******************************************************************************/ +/* + * The mm list to use for @memcg: the memcg's own list, or a single static + * list when @memcg is NULL (expected only with memcg disabled, per the + * warning below). + */ +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) +{ + static struct lru_gen_mm_list mm_list = { + .fifo = LIST_HEAD_INIT(mm_list.fifo), + .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), + }; + +#ifdef CONFIG_MEMCG + if (memcg) + return &memcg->mm_list; +#endif + VM_WARN_ON_ONCE(!mem_cgroup_disabled()); + + return &mm_list; +} +/* + * Append @mm to the mm list of its memcg. With CONFIG_MEMCG the memcg obtained + * from get_mem_cgroup_from_mm() is remembered in mm->lru_gen.memcg and is + * released with mem_cgroup_put() in lru_gen_del_mm(). + */ +void lru_gen_add_mm(struct mm_struct *mm) +{ + int nid; + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list)); +#ifdef CONFIG_MEMCG + VM_WARN_ON_ONCE(mm->lru_gen.memcg); + mm->lru_gen.memcg
= memcg; +#endif + spin_lock(&mm_list->lock); + + for_each_node_state(nid, N_MEMORY) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + if (!lruvec) + continue; + + /* the first addition since the last iteration */ + if (lruvec->mm_state.tail == &mm_list->fifo) + lruvec->mm_state.tail = &mm->lru_gen.list; + } + + list_add_tail(&mm->lru_gen.list, &mm_list->fifo); + + spin_unlock(&mm_list->lock); +} +/* + * Remove @mm from its mm list, first moving any per-node iteration cursor + * (mm_state.head or mm_state.tail) that points at it to the next entry. + * Does nothing if @mm is not on a list. + */ +void lru_gen_del_mm(struct mm_struct *mm) +{ + int nid; + struct lru_gen_mm_list *mm_list; + struct mem_cgroup *memcg = NULL; + + if (list_empty(&mm->lru_gen.list)) + return; + +#ifdef CONFIG_MEMCG + memcg = mm->lru_gen.memcg; +#endif + mm_list = get_mm_list(memcg); + + spin_lock(&mm_list->lock); + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + if (!lruvec) + continue; + + /* where the last iteration ended (exclusive) */ + if (lruvec->mm_state.tail == &mm->lru_gen.list) + lruvec->mm_state.tail = lruvec->mm_state.tail->next; + + /* where the current iteration continues (inclusive) */ + if (lruvec->mm_state.head != &mm->lru_gen.list) + continue; + + lruvec->mm_state.head = lruvec->mm_state.head->next; + /* the deletion ends the current iteration */ + if (lruvec->mm_state.head == &mm_list->fifo) + WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); + } + + list_del_init(&mm->lru_gen.list); + + spin_unlock(&mm_list->lock); + +#ifdef CONFIG_MEMCG + mem_cgroup_put(mm->lru_gen.memcg); + mm->lru_gen.memcg = NULL; +#endif +} + +#ifdef CONFIG_MEMCG +void lru_gen_migrate_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + struct task_struct *task = rcu_dereference_protected(mm->owner, true); + /* The caller holds the alloc_lock of the mm's owner task (asserted below). */ + VM_WARN_ON_ONCE(task->mm != mm); + lockdep_assert_held(&task->alloc_lock); + + /* for mm_update_next_owner() */ + if (mem_cgroup_disabled()) + return; + + /* migration can happen before addition */ + if (!mm->lru_gen.memcg) + return; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(task); + rcu_read_unlock(); + if (memcg ==
mm->lru_gen.memcg) + return; + + VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); + /* The memcg changed: remove @mm and add it again so it is listed under its current memcg. */ + lru_gen_del_mm(mm); + lru_gen_add_mm(mm); +} +#endif + +/* + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of + * bits in a bitmap, k is the number of hash functions and n is the number of + * inserted items. + * + * Page table walkers use one of the two filters to reduce their search space. + * To get rid of non-leaf entries that no longer have enough leaf entries, the + * aging uses the double-buffering technique to flip to the other filter each + * time it produces a new generation. For non-leaf entries that have enough + * leaf entries, the aging carries them over to the next generation in + * walk_pmd_range(); the eviction also report them when walking the rmap + * in lru_gen_look_around(). + * + * For future optimizations: + * 1. It's not necessary to keep both filters all the time. The spare one can be + * freed after the RCU grace period and reallocated if needed again. + * 2. And when reallocating, it's worth scaling its size according to the number + * of inserted entries in the other filter, to reduce the memory overhead on + * small systems and false positives on large systems. + * 3. Jenkins' hash function is an alternative to Knuth's.
+ */ +#define BLOOM_FILTER_SHIFT 15 +/* Index of the Bloom filter used for generation @seq. */ +static inline int filter_gen_from_seq(unsigned long seq) +{ + return seq % NR_BLOOM_FILTERS; +} +/* + * Derive the two bit positions for @item from a single hash_ptr() value of + * 2 * BLOOM_FILTER_SHIFT bits: key[0] is the low BLOOM_FILTER_SHIFT bits and + * key[1] is the remaining high bits. + */ +static void get_item_key(void *item, int *key) +{ + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); + + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); + + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); + key[1] = hash >> BLOOM_FILTER_SHIFT; +} +/* + * Clear the filter for @seq if it exists; otherwise try to allocate a zeroed + * one. Whatever bitmap_zalloc() returns is stored in the slot; the readers + * below handle a NULL filter. + */ +static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq) +{ + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = lruvec->mm_state.filters[gen]; + if (filter) { + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); + return; + } + + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + WRITE_ONCE(lruvec->mm_state.filters[gen], filter); +} +/* Insert @item into the filter for @seq; does nothing if that filter is NULL. */ +static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return; + + get_item_key(item, key); + + if (!test_bit(key[0], filter)) + set_bit(key[0], filter); + if (!test_bit(key[1], filter)) + set_bit(key[1], filter); +} +/* + * Returns true if both key bits of @item are set in the filter for @seq, and + * also whenever that filter is NULL. + */ +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return true; + + get_item_key(item, key); + + return test_bit(key[0], filter) && test_bit(key[1], filter); +} +/* + * With the mm list lock held: if @walk is non-NULL, add walk->mm_stats[] to the + * lruvec's stats slot for walk->max_seq and zero them in @walk; if @last is set + * and NR_HIST_GENS > 1, zero the stats slot for mm_state.seq + 1. + */ +static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) +{ + int i; + int hist; + + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); + + if (walk) { + hist = lru_hist_from_seq(walk->max_seq); + + for (i = 0; i < NR_MM_STATS; i++) { + WRITE_ONCE(lruvec->mm_state.stats[hist][i], + lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); + walk->mm_stats[i] = 0; + } + } + + if
(NR_HIST_GENS > 1 && last) { + hist = lru_hist_from_seq(lruvec->mm_state.seq + 1); + + for (i = 0; i < NR_MM_STATS; i++) + WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); + } +} +/* + * Returns true if @walk should skip @mm. On a false return, mmget_not_zero() + * has succeeded on @mm. + */ +static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) +{ + int type; + unsigned long size = 0; + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap); + + if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap)) + return true; + /* Clear this node's bit; while it stays clear the mm is skipped unless force_scan is set. */ + clear_bit(key, &mm->lru_gen.bitmap); + /* Count file pages, plus anon and shmem pages when walk->can_swap is set. */ + for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { + size += type ? get_mm_counter(mm, MM_FILEPAGES) : + get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + } + + if (size < MIN_LRU_BATCH) + return true; + + return !mmget_not_zero(mm); +} +/* + * Hand the next mm on the mm list to a page table walker. + * + * On entry *@iter is the mm returned by the previous call (NULL for the first + * call); it is released with mmput_async(). On return *@iter is the next mm to + * walk, or NULL. + * + * Returns true when no walkers remain after this call, except for a stale + * start (case 1 below with *@iter NULL), which returns false. + */ +static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, + struct mm_struct **iter) +{ + bool first = false; + bool last = true; + struct mm_struct *mm = NULL; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + + /* + * There are four interesting cases for this page table walker: + * 1. It tries to start a new iteration of mm_list with a stale max_seq; + * there is nothing left to do. + * 2. It's the first of the current generation, and it needs to reset + * the Bloom filter for the next generation. + * 3. It reaches the end of mm_list, and it needs to increment + * mm_state->seq; the iteration is done. + * 4. It's the last of the current generation, and it needs to reset the + * mm stats counters for the next generation.
+ */ + spin_lock(&mm_list->lock); + + VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq); + VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq); + VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers); + /* Cases 1 and 3: mm_state->seq has already caught up with walk->max_seq. */ + if (walk->max_seq <= mm_state->seq) { + if (!*iter) + last = false; + goto done; + } + /* Case 2: no walker is active, so start from the beginning of the list. */ + if (!mm_state->nr_walkers) { + VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); + + mm_state->head = mm_list->fifo.next; + first = true; + } + + while (!mm && mm_state->head != &mm_list->fifo) { + mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + + mm_state->head = mm_state->head->next; + + /* force scan for those added after the last iteration */ + if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) { + mm_state->tail = mm_state->head; + walk->force_scan = true; + } + + if (should_skip_mm(mm, walk)) + mm = NULL; + } + /* Case 3: the end of the list was reached. */ + if (mm_state->head == &mm_list->fifo) + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); +done: + if (*iter && !mm) + mm_state->nr_walkers--; + if (!*iter && mm) + mm_state->nr_walkers++; + + if (mm_state->nr_walkers) + last = false; + + if (*iter || last) + reset_mm_stats(lruvec, walk, last); + + spin_unlock(&mm_list->lock); + + if (mm && first) + reset_bloom_filter(lruvec, walk->max_seq + 1); + /* Release the previous mm and hand the next one (or NULL) to the caller. */ + if (*iter) + mmput_async(*iter); + + *iter = mm; + + return last; +} +/* + * Advance mm_state->seq by one without walking any mm, provided @max_seq is + * ahead of it and no walkers are active; reset_mm_stats() is called with + * last set as well. Returns true if mm_state->seq was advanced. + */ +static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) +{ + bool success = false; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + + spin_lock(&mm_list->lock); + + VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq); + + if (max_seq > mm_state->seq && !mm_state->nr_walkers) { + VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo); + + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); + reset_mm_stats(lruvec, NULL, true); + success = true; + } + + spin_unlock(&mm_list->lock); + + return success; +} + 
+/****************************************************************************** + * refault feedback loop + ******************************************************************************/ + +/* + * A feedback loop based on Proportional-Integral-Derivative (PID) controller. + * + * The P term is refaulted/(evicted+protected) from a tier in the generation + * currently being evicted; the I term is the exponential moving average of the + * P term over the generations previously evicted, using the smoothing factor + * 1/2; the D term isn't supported. + * + * The setpoint (SP) is always the first tier of one type; the process variable + * (PV) is either any tier of the other type or any other tier of the same + * type. + * + * The error is the difference between the SP and the PV; the correction is to + * turn off protection when SP>PV or turn on protection when SP<PV. + * + * For future optimizations: + * 1. The D term may discount the other two terms over time so that long-lived + * generations can resist stale information. + */ +struct ctrl_pos { + unsigned long refaulted; + unsigned long total; + int gain; +}; + +static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, + struct ctrl_pos *pos) +{ + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + pos->refaulted = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + pos->total = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + pos->total += lrugen->protected[hist][type][tier - 1]; + pos->gain = gain; +} + +static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) +{ + int hist, tier; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; + unsigned long seq = carryover ?
lrugen->min_seq[type] : lrugen->max_seq + 1; + + lockdep_assert_held(&lruvec->lru_lock); + + if (!carryover && !clear) + return; + + hist = lru_hist_from_seq(seq); + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + if (carryover) { + unsigned long sum; + + sum = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); + + sum = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + sum += lrugen->protected[hist][type][tier - 1]; + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); + } + + if (clear) { + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); + atomic_long_set(&lrugen->evicted[hist][type][tier], 0); + if (tier) + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + } + } +} + +static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) +{ + /* + * Return true if the PV has a limited number of refaults or a lower + * refaulted/total than the SP. + */ + return pv->refaulted < MIN_LRU_BATCH || + pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= + (sp->refaulted + 1) * pv->total * pv->gain; +} + +/****************************************************************************** + * the aging + ******************************************************************************/ + +/* promote pages accessed through page tables */ +static int folio_update_gen(struct folio *folio, int gen) +{ + unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + + VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(!rcu_read_lock_held()); + + do { + /* lru_gen_del_folio() has isolated this page? 
*/ + if (!(old_flags & LRU_GEN_MASK)) { + /* for shrink_folio_list() */ + new_flags = old_flags | BIT(PG_referenced); + continue; + } + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + + return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; +} + +/* protect pages accessed multiple times through file descriptors */ +static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + int type = folio_is_file_lru(folio); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + + VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); + + do { + new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; + /* folio_update_gen() has promoted this page? */ + if (new_gen >= 0 && new_gen != old_gen) + return new_gen; + + new_gen = (old_gen + 1) % MAX_NR_GENS; + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; + /* for folio_end_writeback() */ + if (reclaiming) + new_flags |= BIT(PG_reclaim); + } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + + lru_gen_update_size(lruvec, folio, old_gen, new_gen); + + return new_gen; +} + +static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio, + int old_gen, int new_gen) +{ + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); + int delta = folio_nr_pages(folio); + + VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS); + VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS); + + walk->batched++; + + walk->nr_pages[old_gen][type][zone] -= delta; + walk->nr_pages[new_gen][type][zone] += delta; +} + +static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk) +{ + int gen, type, zone; + struct lru_gen_struct *lrugen = 
&lruvec->lrugen; + + walk->batched = 0; + + for_each_gen_type_zone(gen, type, zone) { + enum lru_list lru = type * LRU_INACTIVE_FILE; + int delta = walk->nr_pages[gen][type][zone]; + + if (!delta) + continue; + + walk->nr_pages[gen][type][zone] = 0; + WRITE_ONCE(lrugen->nr_pages[gen][type][zone], + lrugen->nr_pages[gen][type][zone] + delta); + + if (lru_gen_is_active(lruvec, gen)) + lru += LRU_ACTIVE; + __update_lru_size(lruvec, lru, zone, delta); + } +} + +static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args) +{ + struct address_space *mapping; + struct vm_area_struct *vma = args->vma; + struct lru_gen_mm_walk *walk = args->private; + + if (!vma_is_accessible(vma)) + return true; + + if (is_vm_hugetlb_page(vma)) + return true; + + if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ)) + return true; + + if (vma == get_gate_vma(vma->vm_mm)) + return true; + + if (vma_is_anonymous(vma)) + return !walk->can_swap; + + if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) + return true; + + mapping = vma->vm_file->f_mapping; + if (mapping_unevictable(mapping)) + return true; + + if (shmem_mapping(mapping)) + return !walk->can_swap; + + /* to exclude special mappings like dax, etc. */ + return !mapping->a_ops->read_folio; +} + +/* + * Some userspace memory allocators map many single-page VMAs. Instead of + * returning back to the PGD table for each of such VMAs, finish an entire PMD + * table to reduce zigzags and improve cache performance. 
+ */ +static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args, + unsigned long *vm_start, unsigned long *vm_end) +{ + unsigned long start = round_up(*vm_end, size); + unsigned long end = (start | ~mask) + 1; + VMA_ITERATOR(vmi, args->mm, start); + + VM_WARN_ON_ONCE(mask & size); + VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); + + for_each_vma(vmi, args->vma) { + if (end && end <= args->vma->vm_start) + return false; + + if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) + continue; + + *vm_start = max(start, args->vma->vm_start); + *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; + + return true; + } + + return false; +} + +static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long pfn = pte_pfn(pte); + + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); + + if (!pte_present(pte) || is_zero_pfn(pfn)) + return -1; + + if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte))) + return -1; + + if (WARN_ON_ONCE(!pfn_valid(pfn))) + return -1; + + return pfn; +} + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) +static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr) +{ + unsigned long pfn = pmd_pfn(pmd); + + VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end); + + if (!pmd_present(pmd) || is_huge_zero_pmd(pmd)) + return -1; + + if (WARN_ON_ONCE(pmd_devmap(pmd))) + return -1; + + if (WARN_ON_ONCE(!pfn_valid(pfn))) + return -1; + + return pfn; +} +#endif + +static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, + struct pglist_data *pgdat, bool can_swap) +{ + struct folio *folio; + + /* try to avoid unnecessary memory loads */ + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + return NULL; + + folio = pfn_folio(pfn); + if (folio_nid(folio) != pgdat->node_id) + return NULL; + + if (folio_memcg_rcu(folio) != memcg) + return NULL; + + 
/* file VMAs can contain anon pages from COW */ + if (!folio_is_file_lru(folio) && !can_swap) + return NULL; + + return folio; +} + +static bool suitable_to_scan(int total, int young) +{ + int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8); + + /* suitable if the average number of young PTEs per cacheline is >=1 */ + return young * n >= total; +} + +static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pte_t *pte; + spinlock_t *ptl; + unsigned long addr; + int total = 0; + int young = 0; + struct lru_gen_mm_walk *walk = args->private; + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + + VM_WARN_ON_ONCE(pmd_leaf(*pmd)); + + ptl = pte_lockptr(args->mm, pmd); + if (!spin_trylock(ptl)) + return false; + + arch_enter_lazy_mmu_mode(); + + pte = pte_offset_map(pmd, start & PMD_MASK); +restart: + for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + struct folio *folio; + + total++; + walk->mm_stats[MM_LEAF_TOTAL]++; + + pfn = get_pte_pfn(pte[i], args->vma, addr); + if (pfn == -1) + continue; + + if (!pte_young(pte[i])) { + walk->mm_stats[MM_LEAF_OLD]++; + continue; + } + + folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + if (!folio) + continue; + + if (!ptep_test_and_clear_young(args->vma, addr, pte + i)) + VM_WARN_ON_ONCE(true); + + young++; + walk->mm_stats[MM_LEAF_YOUNG]++; + + if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); + } + + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) + goto restart; + + pte_unmap(pte); + + 
arch_leave_lazy_mmu_mode(); + spin_unlock(ptl); + + return suitable_to_scan(total, young); +} + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +{ + int i; + pmd_t *pmd; + spinlock_t *ptl; + struct lru_gen_mm_walk *walk = args->private; + struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + int old_gen, new_gen = lru_gen_from_seq(walk->max_seq); + + VM_WARN_ON_ONCE(pud_leaf(*pud)); + + /* try to batch at most 1+MIN_LRU_BATCH+1 entries */ + if (*start == -1) { + *start = next; + return; + } + + i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start); + if (i && i <= MIN_LRU_BATCH) { + __set_bit(i - 1, bitmap); + return; + } + + pmd = pmd_offset(pud, *start); + + ptl = pmd_lockptr(args->mm, pmd); + if (!spin_trylock(ptl)) + goto done; + + arch_enter_lazy_mmu_mode(); + + do { + unsigned long pfn; + struct folio *folio; + unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start; + + pfn = get_pmd_pfn(pmd[i], vma, addr); + if (pfn == -1) + goto next; + + if (!pmd_trans_huge(pmd[i])) { + if (arch_has_hw_nonleaf_pmd_young() && + get_cap(LRU_GEN_NONLEAF_YOUNG)) + pmdp_test_and_clear_young(vma, addr, pmd + i); + goto next; + } + + folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + if (!folio) + goto next; + + if (!pmdp_test_and_clear_young(vma, addr, pmd + i)) + goto next; + + walk->mm_stats[MM_LEAF_YOUNG]++; + + if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); +next: + i = i > MIN_LRU_BATCH ? 
0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; + } while (i <= MIN_LRU_BATCH); + + arch_leave_lazy_mmu_mode(); + spin_unlock(ptl); +done: + *start = -1; + bitmap_zero(bitmap, MIN_LRU_BATCH); +} +#else +static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma, + struct mm_walk *args, unsigned long *bitmap, unsigned long *start) +{ +} +#endif + +static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pmd_t *pmd; + unsigned long next; + unsigned long addr; + struct vm_area_struct *vma; + unsigned long pos = -1; + struct lru_gen_mm_walk *walk = args->private; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + + VM_WARN_ON_ONCE(pud_leaf(*pud)); + + /* + * Finish an entire PMD in two passes: the first only reaches to PTE + * tables to avoid taking the PMD lock; the second, if necessary, takes + * the PMD lock to clear the accessed bit in PMD entries. + */ + pmd = pmd_offset(pud, start & PUD_MASK); +restart: + /* walk_pte_range() may call get_next_vma() */ + vma = args->vma; + for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { + pmd_t val = pmd_read_atomic(pmd + i); + + /* for pmd_read_atomic() */ + barrier(); + + next = pmd_addr_end(addr, end); + + if (!pmd_present(val) || is_huge_zero_pmd(val)) { + walk->mm_stats[MM_LEAF_TOTAL]++; + continue; + } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(val)) { + unsigned long pfn = pmd_pfn(val); + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + + walk->mm_stats[MM_LEAF_TOTAL]++; + + if (!pmd_young(val)) { + walk->mm_stats[MM_LEAF_OLD]++; + continue; + } + + /* try to avoid unnecessary memory loads */ + if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat)) + continue; + + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + continue; + } +#endif + walk->mm_stats[MM_NONLEAF_TOTAL]++; + + if (arch_has_hw_nonleaf_pmd_young() && + get_cap(LRU_GEN_NONLEAF_YOUNG)) { + 
if (!pmd_young(val)) + continue; + + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); + } + + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) + continue; + + walk->mm_stats[MM_NONLEAF_FOUND]++; + + if (!walk_pte_range(&val, addr, next, args)) + continue; + + walk->mm_stats[MM_NONLEAF_ADDED]++; + + /* carry over to the next generation */ + update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i); + } + + walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos); + + if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end)) + goto restart; +} + +static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end, + struct mm_walk *args) +{ + int i; + pud_t *pud; + unsigned long addr; + unsigned long next; + struct lru_gen_mm_walk *walk = args->private; + + VM_WARN_ON_ONCE(p4d_leaf(*p4d)); + + pud = pud_offset(p4d, start & P4D_MASK); +restart: + for (i = pud_index(start), addr = start; addr != end; i++, addr = next) { + pud_t val = READ_ONCE(pud[i]); + + next = pud_addr_end(addr, end); + + if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val))) + continue; + + walk_pmd_range(&val, addr, next, args); + + /* a racy check to curtail the waiting time */ + if (wq_has_sleeper(&walk->lruvec->mm_state.wait)) + return 1; + + if (need_resched() || walk->batched >= MAX_LRU_BATCH) { + end = (addr | ~PUD_MASK) + 1; + goto done; + } + } + + if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end)) + goto restart; + + end = round_up(end, P4D_SIZE); +done: + if (!end || !args->vma) + return 1; + + walk->next_addr = max(end, args->vma->vm_start); + + return -EAGAIN; +} + +static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk) +{ + static const struct mm_walk_ops mm_walk_ops = { + .test_walk = should_skip_vma, + .p4d_entry = walk_pud_range, + }; + + int err; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + walk->next_addr = FIRST_USER_ADDRESS; + + 
do { + err = -EBUSY; + + /* folio_update_gen() requires stable folio_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + break; + + /* the caller might be holding the lock for write */ + if (mmap_read_trylock(mm)) { + err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); + + mmap_read_unlock(mm); + } + + mem_cgroup_unlock_pages(); + + if (walk->batched) { + spin_lock_irq(&lruvec->lru_lock); + reset_batch_size(lruvec, walk); + spin_unlock_irq(&lruvec->lru_lock); + } + + cond_resched(); + } while (err == -EAGAIN); +} + +static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat) +{ + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; + + if (pgdat && current_is_kswapd()) { + VM_WARN_ON_ONCE(walk); + + walk = &pgdat->mm_walk; + } else if (!pgdat && !walk) { + VM_WARN_ON_ONCE(current_is_kswapd()); + + walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + } + + current->reclaim_state->mm_walk = walk; + + return walk; +} + +static void clear_mm_walk(void) +{ + struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk; + + VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages))); + VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats))); + + current->reclaim_state->mm_walk = NULL; + + if (!current_is_kswapd()) + kfree(walk); +} + +static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) +{ + int zone; + int remaining = MAX_LRU_BATCH; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + + if (type == LRU_GEN_ANON && !can_swap) + goto done; + + /* prevent cold/hot inversion if force_scan is true */ + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + struct list_head *head = &lrugen->folios[old_gen][type][zone]; + + while (!list_empty(head)) { + struct folio *folio = lru_to_folio(head); + + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + 
VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); + + new_gen = folio_inc_gen(lruvec, folio, false); + list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); + + if (!--remaining) + return false; + } + } +done: + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); + + return true; +} + +static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) +{ + int gen, type, zone; + bool success = false; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + + /* find the oldest populated generation */ + for (type = !can_swap; type < ANON_AND_FILE; type++) { + while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { + gen = lru_gen_from_seq(min_seq[type]); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + if (!list_empty(&lrugen->folios[gen][type][zone])) + goto next; + } + + min_seq[type]++; + } +next: + ; + } + + /* see the comment on lru_gen_struct */ + if (can_swap) { + min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); + min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); + } + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + if (min_seq[type] == lrugen->min_seq[type]) + continue; + + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); + success = true; + } + + return success; +} + +static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) +{ + int prev, next; + int type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + +restart: + spin_lock_irq(&lruvec->lru_lock); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + + for (type = ANON_AND_FILE - 1; type >= 0; type--) { + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + continue; + + VM_WARN_ON_ONCE(!force_scan && (type == 
LRU_GEN_FILE || can_swap)); + + if (inc_min_seq(lruvec, type, can_swap)) + continue; + + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + goto restart; + } + + /* + * Update the active/inactive LRU sizes for compatibility. Both sides of + * the current max_seq need to be covered, since max_seq+1 can overlap + * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do + * overlap, cold/hot inversion happens. + */ + prev = lru_gen_from_seq(lrugen->max_seq - 1); + next = lru_gen_from_seq(lrugen->max_seq + 1); + + for (type = 0; type < ANON_AND_FILE; type++) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + enum lru_list lru = type * LRU_INACTIVE_FILE; + long delta = lrugen->nr_pages[prev][type][zone] - + lrugen->nr_pages[next][type][zone]; + + if (!delta) + continue; + + __update_lru_size(lruvec, lru, zone, delta); + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); + } + } + + for (type = 0; type < ANON_AND_FILE; type++) + reset_ctrl_pos(lruvec, type, false); + + WRITE_ONCE(lrugen->timestamps[next], jiffies); + /* make sure preceding modifications appear */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); + + spin_unlock_irq(&lruvec->lru_lock); +} + +static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, + struct scan_control *sc, bool can_swap, bool force_scan) +{ + bool success; + struct lru_gen_mm_walk *walk; + struct mm_struct *mm = NULL; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq)); + + /* see the comment in iterate_mm_list() */ + if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) { + success = false; + goto done; + } + + /* + * If the hardware doesn't automatically set the accessed bit, fallback + * to lru_gen_look_around(), which only clears the accessed bit in a + * handful of PTEs. Spreading the work out over a period of time usually + * is less efficient, but it avoids bursty page faults. 
+ */ + if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } + + walk = set_mm_walk(NULL); + if (!walk) { + success = iterate_mm_list_nowalk(lruvec, max_seq); + goto done; + } + + walk->lruvec = lruvec; + walk->max_seq = max_seq; + walk->can_swap = can_swap; + walk->force_scan = force_scan; + + do { + success = iterate_mm_list(lruvec, walk, &mm); + if (mm) + walk_mm(lruvec, mm, walk); + + cond_resched(); + } while (mm); +done: + if (!success) { + if (sc->priority <= DEF_PRIORITY - 2) + wait_event_killable(lruvec->mm_state.wait, + max_seq < READ_ONCE(lrugen->max_seq)); + + return max_seq < READ_ONCE(lrugen->max_seq); + } + + VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); + + inc_max_seq(lruvec, can_swap, force_scan); + /* either this sees any waiters or they will see updated max_seq */ + if (wq_has_sleeper(&lruvec->mm_state.wait)) + wake_up_all(&lruvec->mm_state.wait); + + return true; +} + +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, + struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) +{ + int gen, type, zone; + unsigned long old = 0; + unsigned long young = 0; + unsigned long total = 0; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { + unsigned long size = 0; + + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + + total += size; + if (seq == max_seq) + young += size; + else if (seq + MIN_NR_GENS == max_seq) + old += size; + } + } + + /* try to scrape all its memory if this memcg was deleted */ + *nr_to_scan = mem_cgroup_online(memcg) ? 
(total >> sc->priority) : total; + + /* + * The aging tries to be lazy to reduce the overhead, while the eviction + * stalls when the number of generations reaches MIN_NR_GENS. Hence, the + * ideal number of generations is MIN_NR_GENS+1. + */ + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) + return true; + if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) + return false; + + /* + * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) + * of the total number of pages for each generation. A reasonable range + * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The + * aging cares about the upper bound of hot pages, while the eviction + * cares about the lower bound of cold pages. + */ + if (young * MIN_NR_GENS > total) + return true; + if (old * (MIN_NR_GENS + 2) < total) + return true; + + return false; +} + +static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl) +{ + bool need_aging; + unsigned long nr_to_scan; + int swappiness = get_swappiness(lruvec, sc); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(sc->memcg_low_reclaim); + + mem_cgroup_calculate_protection(NULL, memcg); + + if (mem_cgroup_below_min(memcg)) + return false; + + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); + + if (min_ttl) { + int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + + if (time_is_after_jiffies(birth + min_ttl)) + return false; + + /* the size is likely too small to be helpful */ + if (!nr_to_scan && sc->priority != DEF_PRIORITY) + return false; + } + + if (need_aging) + try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); + + return true; +} + +/* to protect the working set of the last N jiffies */ +static unsigned long lru_gen_min_ttl __read_mostly; + +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control 
*sc) +{ + struct mem_cgroup *memcg; + bool success = false; + unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl); + + VM_WARN_ON_ONCE(!current_is_kswapd()); + + sc->last_reclaimed = sc->nr_reclaimed; + + /* + * To reduce the chance of going into the aging path, which can be + * costly, optimistically skip it if the flag below was cleared in the + * eviction path. This improves the overall performance when multiple + * memcgs are available. + */ + if (!sc->memcgs_need_aging) { + sc->memcgs_need_aging = true; + return; + } + + set_mm_walk(pgdat); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + + if (age_lruvec(lruvec, sc, min_ttl)) + success = true; + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + clear_mm_walk(); + + /* check the order to exclude compaction-induced reclaim */ + if (success || !min_ttl || sc->order) + return; + + /* + * The main goal is to OOM kill if every generation from all memcgs is + * younger than min_ttl. However, another possibility is all memcgs are + * either below min or empty. + */ + if (mutex_trylock(&oom_lock)) { + struct oom_control oc = { + .gfp_mask = sc->gfp_mask, + }; + + out_of_memory(&oc); + + mutex_unlock(&oom_lock); + } +} + +/* + * This function exploits spatial locality when shrink_folio_list() walks the + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If + * the scan was done cacheline efficiently, it adds the PMD entry pointing to + * the PTE table to the Bloom filter. This forms a feedback loop between the + * eviction and the aging. 
+ */ +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) +{ + int i; + pte_t *pte; + unsigned long start; + unsigned long end; + unsigned long addr; + struct lru_gen_mm_walk *walk; + int young = 0; + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; + struct folio *folio = pfn_folio(pvmw->pfn); + struct mem_cgroup *memcg = folio_memcg(folio); + struct pglist_data *pgdat = folio_pgdat(folio); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + DEFINE_MAX_SEQ(lruvec); + int old_gen, new_gen = lru_gen_from_seq(max_seq); + + lockdep_assert_held(pvmw->ptl); + VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); + + if (spin_is_contended(pvmw->ptl)) + return; + + /* avoid taking the LRU lock under the PTL when possible */ + walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL; + + start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); + end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1; + + if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { + if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2) + end = start + MIN_LRU_BATCH * PAGE_SIZE; + else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2) + start = end - MIN_LRU_BATCH * PAGE_SIZE; + else { + start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2; + end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2; + } + } + + pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; + + rcu_read_lock(); + arch_enter_lazy_mmu_mode(); + + for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) { + unsigned long pfn; + + pfn = get_pte_pfn(pte[i], pvmw->vma, addr); + if (pfn == -1) + continue; + + if (!pte_young(pte[i])) + continue; + + folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); + if (!folio) + continue; + + if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i)) + VM_WARN_ON_ONCE(true); + + young++; + + if (pte_dirty(pte[i]) && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + 
!folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + old_gen = folio_lru_gen(folio); + if (old_gen < 0) + folio_set_referenced(folio); + else if (old_gen != new_gen) + __set_bit(i, bitmap); + } + + arch_leave_lazy_mmu_mode(); + rcu_read_unlock(); + + /* feedback from rmap walkers to page table walkers */ + if (suitable_to_scan(i, young)) + update_bloom_filter(lruvec, max_seq, pvmw->pmd); + + if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + folio = pfn_folio(pte_pfn(pte[i])); + folio_activate(folio); + } + return; + } + + /* folio_update_gen() requires stable folio_memcg() */ + if (!mem_cgroup_trylock_pages(memcg)) + return; + + if (!walk) { + spin_lock_irq(&lruvec->lru_lock); + new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); + } + + for_each_set_bit(i, bitmap, MIN_LRU_BATCH) { + folio = pfn_folio(pte_pfn(pte[i])); + if (folio_memcg_rcu(folio) != memcg) + continue; + + old_gen = folio_update_gen(folio, new_gen); + if (old_gen < 0 || old_gen == new_gen) + continue; + + if (walk) + update_batch_size(walk, folio, old_gen, new_gen); + else + lru_gen_update_size(lruvec, folio, old_gen, new_gen); + } + + if (!walk) + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_unlock_pages(); +} + +/****************************************************************************** + * the eviction + ******************************************************************************/ + +static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc, + int tier_idx) +{ + bool success; + int gen = folio_lru_gen(folio); + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); + int delta = folio_nr_pages(folio); + int refs = folio_lru_refs(folio); + int tier = lru_tier_from_refs(refs); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); + + /* unevictable */ + if (!folio_evictable(folio)) { + success = 
lru_gen_del_folio(lruvec, folio, true); + VM_WARN_ON_ONCE_FOLIO(!success, folio); + folio_set_unevictable(folio); + lruvec_add_folio(lruvec, folio); + __count_vm_events(UNEVICTABLE_PGCULLED, delta); + return true; + } + + /* dirty lazyfree */ + if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { + success = lru_gen_del_folio(lruvec, folio, true); + VM_WARN_ON_ONCE_FOLIO(!success, folio); + folio_set_swapbacked(folio); + lruvec_add_folio_tail(lruvec, folio); + return true; + } + + /* promoted */ + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); + return true; + } + + /* protected */ + if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + gen = folio_inc_gen(lruvec, folio, false); + list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], + lrugen->protected[hist][type][tier - 1] + delta); + __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); + return true; + } + + /* ineligible */ + if (zone > sc->reclaim_idx) { + gen = folio_inc_gen(lruvec, folio, false); + list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + return true; + } + + /* waiting for writeback */ + if (folio_test_locked(folio) || folio_test_writeback(folio) || + (type == LRU_GEN_FILE && folio_test_dirty(folio))) { + gen = folio_inc_gen(lruvec, folio, true); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); + return true; + } + + return false; +} + +static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) +{ + bool success; + + /* unmapping inhibited */ + if (!sc->may_unmap && folio_mapped(folio)) + return false; + + /* swapping inhibited */ + if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && + (folio_test_dirty(folio) || + (folio_test_anon(folio) && !folio_test_swapcache(folio)))) + return false; + + /* raced 
with release_pages() */ + if (!folio_try_get(folio)) + return false; + + /* raced with another isolation */ + if (!folio_test_clear_lru(folio)) { + folio_put(folio); + return false; + } + + /* see the comment on MAX_NR_TIERS */ + if (!folio_test_referenced(folio)) + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + + /* for shrink_folio_list() */ + folio_clear_reclaim(folio); + folio_clear_referenced(folio); + + success = lru_gen_del_folio(lruvec, folio, true); + VM_WARN_ON_ONCE_FOLIO(!success, folio); + + return true; +} + +static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, + int type, int tier, struct list_head *list) +{ + int i; + int gen; + enum vm_event_item item; + int sorted = 0; + int scanned = 0; + int isolated = 0; + int remaining = MAX_LRU_BATCH; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + VM_WARN_ON_ONCE(!list_empty(list)); + + if (get_nr_gens(lruvec, type) == MIN_NR_GENS) + return 0; + + gen = lru_gen_from_seq(lrugen->min_seq[type]); + + for (i = MAX_NR_ZONES; i > 0; i--) { + LIST_HEAD(moved); + int skipped = 0; + int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES; + struct list_head *head = &lrugen->folios[gen][type][zone]; + + while (!list_empty(head)) { + struct folio *folio = lru_to_folio(head); + int delta = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); + + scanned += delta; + + if (sort_folio(lruvec, folio, sc, tier)) + sorted += delta; + else if (isolate_folio(lruvec, folio, sc)) { + list_add(&folio->lru, list); + isolated += delta; + } else { + list_move(&folio->lru, &moved); + skipped += delta; + } + + if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) + break; + } + + if (skipped) { + list_splice(&moved, head); + 
__count_zid_vm_events(PGSCAN_SKIP, zone, skipped); + } + + if (!remaining || isolated >= MIN_LRU_BATCH) + break; + } + + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + if (!cgroup_reclaim(sc)) { + __count_vm_events(item, isolated); + __count_vm_events(PGREFILL, sorted); + } + __count_memcg_events(memcg, item, isolated); + __count_memcg_events(memcg, PGREFILL, sorted); + __count_vm_events(PGSCAN_ANON + type, isolated); + + /* + * There might not be eligible pages due to reclaim_idx, may_unmap and + * may_writepage. Check the remaining to prevent livelock if it's not + * making progress. + */ + return isolated || !remaining ? scanned : 0; +} + +static int get_tier_idx(struct lruvec *lruvec, int type) +{ + int tier; + struct ctrl_pos sp, pv; + + /* + * To leave a margin for fluctuations, use a larger gain factor (1:2). + * This value is chosen because any other tier would have at least twice + * as many refaults as the first tier. + */ + read_ctrl_pos(lruvec, type, 0, 1, &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, 2, &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + return tier - 1; +} + +static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) +{ + int type, tier; + struct ctrl_pos sp, pv; + int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; + + /* + * Compare the first tier of anon with that of file to determine which + * type to scan. Also need to compare other tiers of the selected type + * with the first tier of the other type to determine the last tier (of + * the selected type) to evict. 
+ */ + read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); + read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); + type = positive_ctrl_err(&sp, &pv); + + read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, gain[type], &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + *tier_idx = tier - 1; + + return type; +} + +static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + int *type_scanned, struct list_head *list) +{ + int i; + int type; + int scanned; + int tier = -1; + DEFINE_MIN_SEQ(lruvec); + + /* + * Try to make the obvious choice first. When anon and file are both + * available from the same generation, interpret swappiness 1 as file + * first and 200 as anon first. + */ + if (!swappiness) + type = LRU_GEN_FILE; + else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) + type = LRU_GEN_ANON; + else if (swappiness == 1) + type = LRU_GEN_FILE; + else if (swappiness == 200) + type = LRU_GEN_ANON; + else + type = get_type_to_scan(lruvec, swappiness, &tier); + + for (i = !swappiness; i < ANON_AND_FILE; i++) { + if (tier < 0) + tier = get_tier_idx(lruvec, type); + + scanned = scan_folios(lruvec, sc, type, tier, list); + if (scanned) + break; + + type = !type; + tier = -1; + } + + *type_scanned = type; + + return scanned; +} + +static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + bool *need_swapping) +{ + int type; + int scanned; + int reclaimed; + LIST_HEAD(list); + LIST_HEAD(clean); + struct folio *folio; + struct folio *next; + enum vm_event_item item; + struct reclaim_stat stat; + struct lru_gen_mm_walk *walk; + bool skip_retry = false; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock_irq(&lruvec->lru_lock); + + scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); + + scanned += 
try_to_inc_min_seq(lruvec, swappiness); + + if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) + scanned = 0; + + spin_unlock_irq(&lruvec->lru_lock); + + if (list_empty(&list)) + return scanned; +retry: + reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); + sc->nr_reclaimed += reclaimed; + + list_for_each_entry_safe_reverse(folio, next, &list, lru) { + if (!folio_evictable(folio)) { + list_del(&folio->lru); + folio_putback_lru(folio); + continue; + } + + if (folio_test_reclaim(folio) && + (folio_test_dirty(folio) || folio_test_writeback(folio))) { + /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ + if (folio_test_workingset(folio)) + folio_set_referenced(folio); + continue; + } + + if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || + folio_mapped(folio) || folio_test_locked(folio) || + folio_test_dirty(folio) || folio_test_writeback(folio)) { + /* don't add rejected folios to the oldest generation */ + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, + BIT(PG_active)); + continue; + } + + /* retry folios that may have missed folio_rotate_reclaimable() */ + list_move(&folio->lru, &clean); + sc->nr_scanned -= folio_nr_pages(folio); + } + + spin_lock_irq(&lruvec->lru_lock); + + move_folios_to_lru(lruvec, &list); + + walk = current->reclaim_state->mm_walk; + if (walk && walk->batched) + reset_batch_size(lruvec, walk); + + item = current_is_kswapd() ? 
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + if (!cgroup_reclaim(sc)) + __count_vm_events(item, reclaimed); + __count_memcg_events(memcg, item, reclaimed); + __count_vm_events(PGSTEAL_ANON + type, reclaimed); + + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_uncharge_list(&list); + free_unref_page_list(&list); + + INIT_LIST_HEAD(&list); + list_splice_init(&clean, &list); + + if (!list_empty(&list)) { + skip_retry = true; + goto retry; + } + + if (need_swapping && type == LRU_GEN_ANON) + *need_swapping = true; + + return scanned; +} + +/* + * For future optimizations: + * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg + * reclaim. + */ +static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, + bool can_swap, bool *need_aging) +{ + unsigned long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (mem_cgroup_below_min(memcg) || + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) + return 0; + + *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); + if (!*need_aging) + return nr_to_scan; + + /* skip the aging path at the default priority */ + if (sc->priority == DEF_PRIORITY) + goto done; + + /* leave the work to lru_gen_age_node() */ + if (current_is_kswapd()) + return 0; + + if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) + return nr_to_scan; +done: + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? 
nr_to_scan : 0; +} + +static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, + struct scan_control *sc, bool need_swapping) +{ + int i; + DEFINE_MAX_SEQ(lruvec); + + if (!current_is_kswapd()) { + /* age each memcg at most once to ensure fairness */ + if (max_seq - seq > 1) + return true; + + /* over-swapping can increase allocation latency */ + if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping) + return true; + + /* give this thread a chance to exit and free its memory */ + if (fatal_signal_pending(current)) { + sc->nr_reclaimed += MIN_LRU_BATCH; + return true; + } + + if (cgroup_reclaim(sc)) + return false; + } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim) + return false; + + /* keep scanning at low priorities to ensure fairness */ + if (sc->priority > DEF_PRIORITY - 2) + return false; + + /* + * A minimum amount of work was done under global memory pressure. For + * kswapd, it may be overshooting. For direct reclaim, the allocation + * may succeed if all suitable zones are somewhat safe. In either case, + * it's better to stop now, and restart later if necessary. + */ + for (i = 0; i <= sc->reclaim_idx; i++) { + unsigned long wmark; + struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; + + if (!managed_zone(zone)) + continue; + + wmark = current_is_kswapd() ? 
high_wmark_pages(zone) : low_wmark_pages(zone); + if (wmark > zone_page_state(zone, NR_FREE_PAGES)) + return false; + } + + sc->nr_reclaimed += MIN_LRU_BATCH; + + return true; +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + struct blk_plug plug; + bool need_aging = false; + bool need_swapping = false; + unsigned long scanned = 0; + unsigned long reclaimed = sc->nr_reclaimed; + DEFINE_MAX_SEQ(lruvec); + + lru_add_drain(); + + blk_start_plug(&plug); + + set_mm_walk(lruvec_pgdat(lruvec)); + + while (true) { + int delta; + int swappiness; + unsigned long nr_to_scan; + + if (sc->may_swap) + swappiness = get_swappiness(lruvec, sc); + else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) + swappiness = 1; + else + swappiness = 0; + + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging); + if (!nr_to_scan) + goto done; + + delta = evict_folios(lruvec, sc, swappiness, &need_swapping); + if (!delta) + goto done; + + scanned += delta; + if (scanned >= nr_to_scan) + break; + + if (should_abort_scan(lruvec, max_seq, sc, need_swapping)) + break; + + cond_resched(); + } + + /* see the comment in lru_gen_age_node() */ + if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging) + sc->memcgs_need_aging = false; +done: + clear_mm_walk(); + + blk_finish_plug(&plug); +} + +/****************************************************************************** + * state change + ******************************************************************************/ + +static bool __maybe_unused state_is_valid(struct lruvec *lruvec) +{ + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + if (lrugen->enabled) { + enum lru_list lru; + + for_each_evictable_lru(lru) { + if (!list_empty(&lruvec->lists[lru])) + return false; + } + } else { + int gen, type, zone; + + for_each_gen_type_zone(gen, type, zone) { + if (!list_empty(&lrugen->folios[gen][type][zone])) + return false; + } + } + + return true; +} + +static bool 
fill_evictable(struct lruvec *lruvec) +{ + enum lru_list lru; + int remaining = MAX_LRU_BATCH; + + for_each_evictable_lru(lru) { + int type = is_file_lru(lru); + bool active = is_active_lru(lru); + struct list_head *head = &lruvec->lists[lru]; + + while (!list_empty(head)) { + bool success; + struct folio *folio = lru_to_folio(head); + + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio); + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); + VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); + + lruvec_del_folio(lruvec, folio); + success = lru_gen_add_folio(lruvec, folio, false); + VM_WARN_ON_ONCE(!success); + + if (!--remaining) + return false; + } + } + + return true; +} + +static bool drain_evictable(struct lruvec *lruvec) +{ + int gen, type, zone; + int remaining = MAX_LRU_BATCH; + + for_each_gen_type_zone(gen, type, zone) { + struct list_head *head = &lruvec->lrugen.folios[gen][type][zone]; + + while (!list_empty(head)) { + bool success; + struct folio *folio = lru_to_folio(head); + + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); + + success = lru_gen_del_folio(lruvec, folio, false); + VM_WARN_ON_ONCE(!success); + lruvec_add_folio(lruvec, folio); + + if (!--remaining) + return false; + } + } + + return true; +} + +static void lru_gen_change_state(bool enabled) +{ + static DEFINE_MUTEX(state_mutex); + + struct mem_cgroup *memcg; + + cgroup_lock(); + cpus_read_lock(); + get_online_mems(); + mutex_lock(&state_mutex); + + if (enabled == lru_gen_enabled()) + goto unlock; + + if (enabled) + static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); + else + static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do 
{ + int nid; + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + if (!lruvec) + continue; + + spin_lock_irq(&lruvec->lru_lock); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + VM_WARN_ON_ONCE(!state_is_valid(lruvec)); + + lruvec->lrugen.enabled = enabled; + + while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + spin_lock_irq(&lruvec->lru_lock); + } + + spin_unlock_irq(&lruvec->lru_lock); + } + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); +unlock: + mutex_unlock(&state_mutex); + put_online_mems(); + cpus_read_unlock(); + cgroup_unlock(); +} + +/****************************************************************************** + * sysfs interface + ******************************************************************************/ + +static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) +{ + unsigned int msecs; + + if (kstrtouint(buf, 0, &msecs)) + return -EINVAL; + + WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs)); + + return len; +} + +static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( + min_ttl_ms, 0644, show_min_ttl, store_min_ttl +); + +static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + unsigned int caps = 0; + + if (get_cap(LRU_GEN_CORE)) + caps |= BIT(LRU_GEN_CORE); + + if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) + caps |= BIT(LRU_GEN_MM_WALK); + + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) + caps |= BIT(LRU_GEN_NONLEAF_YOUNG); + + return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for 
details */ +static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) +{ + int i; + unsigned int caps; + + if (tolower(*buf) == 'n') + caps = 0; + else if (tolower(*buf) == 'y') + caps = -1; + else if (kstrtouint(buf, 0, &caps)) + return -EINVAL; + + for (i = 0; i < NR_LRU_GEN_CAPS; i++) { + bool enabled = caps & BIT(i); + + if (i == LRU_GEN_CORE) + lru_gen_change_state(enabled); + else if (enabled) + static_branch_enable(&lru_gen_caps[i]); + else + static_branch_disable(&lru_gen_caps[i]); + } + + return len; +} + +static struct kobj_attribute lru_gen_enabled_attr = __ATTR( + enabled, 0644, show_enabled, store_enabled +); + +static struct attribute *lru_gen_attrs[] = { + &lru_gen_min_ttl_attr.attr, + &lru_gen_enabled_attr.attr, + NULL +}; + +static struct attribute_group lru_gen_attr_group = { + .name = "lru_gen", + .attrs = lru_gen_attrs, +}; + +/****************************************************************************** + * debugfs interface + ******************************************************************************/ + +static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) +{ + struct mem_cgroup *memcg; + loff_t nr_to_skip = *pos; + + m->private = kvmalloc(PATH_MAX, GFP_KERNEL); + if (!m->private) + return ERR_PTR(-ENOMEM); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + int nid; + + for_each_node_state(nid, N_MEMORY) { + if (!nr_to_skip--) + return get_lruvec(memcg, nid); + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); + + return NULL; +} + +static void lru_gen_seq_stop(struct seq_file *m, void *v) +{ + if (!IS_ERR_OR_NULL(v)) + mem_cgroup_iter_break(NULL, lruvec_memcg(v)); + + kvfree(m->private); + m->private = NULL; +} + +static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + int nid = lruvec_pgdat(v)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(v); + + ++*pos; + + nid = next_memory_node(nid); + if (nid == MAX_NUMNODES) { + memcg = 
mem_cgroup_iter(NULL, memcg, NULL); + if (!memcg) + return NULL; + + nid = first_memory_node; + } + + return get_lruvec(memcg, nid); +} + +static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, + unsigned long max_seq, unsigned long *min_seq, + unsigned long seq) +{ + int i; + int type, tier; + int hist = lru_hist_from_seq(seq); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + seq_printf(m, " %10d", tier); + for (type = 0; type < ANON_AND_FILE; type++) { + const char *s = " "; + unsigned long n[3] = {}; + + if (seq == max_seq) { + s = "RT "; + n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); + n[1] = READ_ONCE(lrugen->avg_total[type][tier]); + } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { + s = "rep"; + n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); + n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); + } + + for (i = 0; i < 3; i++) + seq_printf(m, " %10lu%c", n[i], s[i]); + } + seq_putc(m, '\n'); + } + + seq_puts(m, " "); + for (i = 0; i < NR_MM_STATS; i++) { + const char *s = " "; + unsigned long n = 0; + + if (seq == max_seq && NR_HIST_GENS == 1) { + s = "LOYNFA"; + n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + } else if (seq != max_seq && NR_HIST_GENS > 1) { + s = "loynfa"; + n = READ_ONCE(lruvec->mm_state.stats[hist][i]); + } + + seq_printf(m, " %10lu%c", n, s[i]); + } + seq_putc(m, '\n'); +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static int lru_gen_seq_show(struct seq_file *m, void *v) +{ + unsigned long seq; + bool full = !debugfs_real_fops(m->file)->write; + struct lruvec *lruvec = v; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int nid = lruvec_pgdat(lruvec)->node_id; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (nid == first_memory_node) { + const char *path = 
memcg ? m->private : ""; + +#ifdef CONFIG_MEMCG + if (memcg) + cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); +#endif + seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); + } + + seq_printf(m, " node %5d\n", nid); + + if (!full) + seq = min_seq[LRU_GEN_ANON]; + else if (max_seq >= MAX_NR_GENS) + seq = max_seq - MAX_NR_GENS + 1; + else + seq = 0; + + for (; seq <= max_seq; seq++) { + int type, zone; + int gen = lru_gen_from_seq(seq); + unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); + + seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); + + for (type = 0; type < ANON_AND_FILE; type++) { + unsigned long size = 0; + char mark = full && seq < min_seq[type] ? 'x' : ' '; + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + + seq_printf(m, " %10lu%c", size, mark); + } + + seq_putc(m, '\n'); + + if (full) + lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); + } + + return 0; +} + +static const struct seq_operations lru_gen_seq_ops = { + .start = lru_gen_seq_start, + .stop = lru_gen_seq_stop, + .next = lru_gen_seq_next, + .show = lru_gen_seq_show, +}; + +static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, + bool can_swap, bool force_scan) +{ + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (seq < max_seq) + return 0; + + if (seq > max_seq) + return -EINVAL; + + if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) + return -ERANGE; + + try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); + + return 0; +} + +static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, + int swappiness, unsigned long nr_to_reclaim) +{ + DEFINE_MAX_SEQ(lruvec); + + if (seq + MIN_NR_GENS > max_seq) + return -EINVAL; + + sc->nr_reclaimed = 0; + + while (!signal_pending(current)) { + DEFINE_MIN_SEQ(lruvec); + + if (seq < min_seq[!swappiness]) + return 0; + + if (sc->nr_reclaimed 
>= nr_to_reclaim) + return 0; + + if (!evict_folios(lruvec, sc, swappiness, NULL)) + return 0; + + cond_resched(); + } + + return -EINTR; +} + +static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, + struct scan_control *sc, int swappiness, unsigned long opt) +{ + struct lruvec *lruvec; + int err = -EINVAL; + struct mem_cgroup *memcg = NULL; + + if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) + return -EINVAL; + + if (!mem_cgroup_disabled()) { + rcu_read_lock(); + memcg = mem_cgroup_from_id(memcg_id); +#ifdef CONFIG_MEMCG + if (memcg && !css_tryget(&memcg->css)) + memcg = NULL; +#endif + rcu_read_unlock(); + + if (!memcg) + return -EINVAL; + } + + if (memcg_id != mem_cgroup_id(memcg)) + goto done; + + lruvec = get_lruvec(memcg, nid); + + if (swappiness < 0) + swappiness = get_swappiness(lruvec, sc); + else if (swappiness > 200) + goto done; + + switch (cmd) { + case '+': + err = run_aging(lruvec, seq, sc, swappiness, opt); + break; + case '-': + err = run_eviction(lruvec, seq, sc, swappiness, opt); + break; + } +done: + mem_cgroup_put(memcg); + + return err; +} + +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, + size_t len, loff_t *pos) +{ + void *buf; + char *cur, *next; + unsigned int flags; + struct blk_plug plug; + int err = -EINVAL; + struct scan_control sc = { + .may_writepage = true, + .may_unmap = true, + .may_swap = true, + .reclaim_idx = MAX_NR_ZONES - 1, + .gfp_mask = GFP_KERNEL, + }; + + buf = kvmalloc(len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, src, len)) { + kvfree(buf); + return -EFAULT; + } + + set_task_reclaim_state(current, &sc.reclaim_state); + flags = memalloc_noreclaim_save(); + blk_start_plug(&plug); + if (!set_mm_walk(NULL)) { + err = -ENOMEM; + goto done; + } + + next = buf; + next[len] = '\0'; + + while ((cur = strsep(&next, ",;\n"))) { + int n; + int end; + char cmd; + 
unsigned int memcg_id; + unsigned int nid; + unsigned long seq; + unsigned int swappiness = -1; + unsigned long opt = -1; + + cur = skip_spaces(cur); + if (!*cur) + continue; + + n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, + &seq, &end, &swappiness, &end, &opt, &end); + if (n < 4 || cur[end]) { + err = -EINVAL; + break; + } + + err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); + if (err) + break; + } +done: + clear_mm_walk(); + blk_finish_plug(&plug); + memalloc_noreclaim_restore(flags); + set_task_reclaim_state(current, NULL); + + kvfree(buf); + + return err ? : len; +} + +static int lru_gen_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &lru_gen_seq_ops); +} + +static const struct file_operations lru_gen_rw_fops = { + .open = lru_gen_seq_open, + .read = seq_read, + .write = lru_gen_seq_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations lru_gen_ro_fops = { + .open = lru_gen_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/****************************************************************************** + * initialization + ******************************************************************************/ + +void lru_gen_init_lruvec(struct lruvec *lruvec) +{ + int i; + int gen, type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + lrugen->max_seq = MIN_NR_GENS + 1; + lrugen->enabled = lru_gen_enabled(); + + for (i = 0; i <= MIN_NR_GENS + 1; i++) + lrugen->timestamps[i] = jiffies; + + for_each_gen_type_zone(gen, type, zone) + INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]); + + lruvec->mm_state.seq = MIN_NR_GENS; + init_waitqueue_head(&lruvec->mm_state.wait); +} + +#ifdef CONFIG_MEMCG +void lru_gen_init_memcg(struct mem_cgroup *memcg) +{ + INIT_LIST_HEAD(&memcg->mm_list.fifo); + spin_lock_init(&memcg->mm_list.lock); +} + +void lru_gen_exit_memcg(struct mem_cgroup *memcg) +{ + int i; + int nid; + + 
for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0, + sizeof(lruvec->lrugen.nr_pages))); + + for (i = 0; i < NR_BLOOM_FILTERS; i++) { + bitmap_free(lruvec->mm_state.filters[i]); + lruvec->mm_state.filters[i] = NULL; + } + } +} +#endif + +/* + * Late initcall that creates the lru_gen sysfs attribute group under mm_kobj + * and the "lru_gen" (read-write) and "lru_gen_full" (read-only) debugfs files. + * A sysfs failure is only logged; the initcall always returns 0. + */ +static int __init init_lru_gen(void) +{ + BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); + BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + + if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) + pr_err("lru_gen: failed to create sysfs group\n"); + + debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); + debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); + + return 0; +} +late_initcall(init_lru_gen); + +#else /* !CONFIG_LRU_GEN */ + +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ +} + +#endif /* CONFIG_LRU_GEN */ + +static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + unsigned long nr[NR_LRU_LISTS]; + unsigned long targets[NR_LRU_LISTS]; + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; + bool proportional_reclaim; + struct blk_plug plug; + + if (lru_gen_enabled()) { + lru_gen_shrink_lruvec(lruvec, sc); + return; + } + + get_scan_count(lruvec, sc, nr); + + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers.
Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + + blk_start_plug(&plug); + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { + unsigned long nr_anon, nr_file, percentage; + unsigned long nr_scanned; + + for_each_evictable_lru(lru) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + + nr_reclaimed += shrink_list(lru, nr_to_scan, + lruvec, sc); + } + } + + cond_resched(); + + if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) + continue; + + /* + * For kswapd and memcg, reclaim at least the number of pages + * requested. Ensure that the anon and file LRUs are scanned + * proportionally to what was requested by get_scan_count(). We + * stop reclaiming one LRU and reduce the amount of scanning + * proportionally to the original scan target. + */ + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim.
+ */ + if (!nr_file || !nr_anon) + break; + + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; + lru = LRU_BASE; + percentage = nr_anon * 100 / scan_target; + } else { + unsigned long scan_target = targets[LRU_INACTIVE_FILE] + + targets[LRU_ACTIVE_FILE] + 1; + lru = LRU_FILE; + percentage = nr_file * 100 / scan_target; + } + + /* Stop scanning the smaller of the LRU */ + nr[lru] = 0; + nr[lru + LRU_ACTIVE] = 0; + + /* + * Recalculate the other LRU scan count based on its original + * scan target and the percentage scanning already complete + */ + lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + lru += LRU_ACTIVE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + + /* + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. + */ + if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) && + inactive_is_low(lruvec, LRU_INACTIVE_ANON)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); +} + +/* Use reclaim/compaction for costly allocs or under memory pressure */ +static bool in_reclaim_compaction(struct scan_control *sc) +{ + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + (sc->order > PAGE_ALLOC_COSTLY_ORDER || + sc->priority < DEF_PRIORITY - 2)) + return true; + + return false; +} + +/* + * Reclaim/compaction is used for high-order allocation requests. It reclaims + * order-0 pages before compacting the zone. should_continue_reclaim() returns + * true if more pages should be reclaimed such that when the page allocator + * calls try_to_compact_pages() that it will have enough free pages to succeed. 
+ * It will give up earlier than that if there is difficulty reclaiming pages. + */ +static inline bool should_continue_reclaim(struct pglist_data *pgdat, + unsigned long nr_reclaimed, + struct scan_control *sc) +{ + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + int z; + + /* If not in reclaim/compaction mode, stop */ + if (!in_reclaim_compaction(sc)) + return false; + + /* + * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX + * number of pages that were scanned. This will return to the caller + * with the risk that reclaim/compaction and the resulting allocation + * attempt fail. In the past we have tried harder for __GFP_RETRY_MAYFAIL + * allocations through requiring that the full LRU list has been scanned + * first, by assuming that zero delta of sc->nr_scanned means full LRU + * scan, but that approximation was wrong, and there were corner cases + * where a non-zero number of pages was always scanned. + */ + if (!nr_reclaimed) + return false; + + /* If compaction would go ahead or the allocation would succeed, stop */ + for (z = 0; z <= sc->reclaim_idx; z++) { + struct zone *zone = &pgdat->node_zones[z]; + if (!managed_zone(zone)) + continue; + + switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { + case COMPACT_SUCCESS: + case COMPACT_CONTINUE: + return false; + default: + /* check next zone */ + ; + } + } + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = compact_gap(sc->order); + inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); + if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc)) + inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); + + return inactive_lru_pages > pages_for_compaction; +} + +static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) +{ + struct mem_cgroup *target_memcg = sc->target_mem_cgroup; + struct mem_cgroup *memcg; + +
memcg = mem_cgroup_iter(target_memcg, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long reclaimed; + unsigned long scanned; + + /* + * This loop can become CPU-bound when target memcgs + * aren't eligible for reclaim - either because they + * don't have any reclaimable pages, or because their + * memory is explicitly protected. Avoid soft lockups. + */ + cond_resched(); + + mem_cgroup_calculate_protection(target_memcg, memcg); + + if (mem_cgroup_below_min(memcg)) { + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + } else if (mem_cgroup_below_low(memcg)) { + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; + continue; + } + memcg_memory_event(memcg, MEMCG_LOW); + } + + reclaimed = sc->nr_reclaimed; + scanned = sc->nr_scanned; + + shrink_lruvec(lruvec, sc); + + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority); + + /* Record the group's reclaim efficiency */ + if (!sc->proactive) + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + + } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); +} + +static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) +{ + struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long nr_reclaimed, nr_scanned; + struct lruvec *target_lruvec; + bool reclaimable = false; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + +again: + memset(&sc->nr, 0, sizeof(sc->nr)); + + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; + + prepare_scan_count(pgdat, sc); + + shrink_node_memcgs(pgdat, sc); + + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + + /* Record the subtree's reclaim efficiency */ + if 
(!sc->proactive) + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + + if (current_is_kswapd()) { + /* + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under writeback flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim. */ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); + + /* + * If kswapd scans pages marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so forcibly stall + * until some pages complete writeback. + */ + if (sc->nr.immediate) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + } + + /* + * Tag a node/memcg as congested if all the dirty pages were marked + * for writeback and immediate reclaim (counted in nr.congested). + * + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in reclaim_throttle().
+ */ + if ((current_is_kswapd() || + (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); + + /* + * Stall direct reclaim for IO completions if the target lruvec + * is flagged congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!current_is_kswapd() && current_may_throttle() && + !sc->hibernation_mode && + test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED); + + if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, + sc)) + goto again; + + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; +} + +/* + * Returns true if compaction should go ahead for a costly-order request, or + * the allocation would already succeed without compaction. Return false if we + * should reclaim first. + */ +static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +{ + unsigned long watermark; + enum compact_result suitable; + + suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx); + if (suitable == COMPACT_SUCCESS) + /* Allocation should succeed already. Don't reclaim. */ + return true; + if (suitable == COMPACT_SKIPPED) + /* Compaction cannot yet proceed. Do reclaim. */ + return false; + + /* + * Compaction is already possible, but it takes time to run and there + * are potentially other callers using the pages just freed. So proceed + * with reclaim to make a buffer of free pages available to give + * compaction a reasonable chance of completing and allocating the page.
+ * Note that we won't actually reclaim the whole buffer in one attempt + * as the target watermark in should_continue_reclaim() is lower. But if + * we are already above the high+gap watermark, don't reclaim at all. + */ + watermark = high_wmark_pages(zone) + compact_gap(sc->order); + + return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); +} + +static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc) +{ + /* + * If reclaim is making progress greater than 12.5% efficiency (more + * than nr_scanned >> 3 pages reclaimed) then wake all the NOPROGRESS + * throttled tasks. + */ + if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) { + wait_queue_head_t *wqh; + + wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS]; + if (waitqueue_active(wqh)) + wake_up(wqh); + + return; + } + + /* + * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will + * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages + * under writeback and marked for immediate reclaim at the tail of the + * LRU. + */ + if (current_is_kswapd() || cgroup_reclaim(sc)) + return; + + /* Throttle if making no progress at high priorities. */ + if (sc->priority == 1 && !sc->nr_reclaimed) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS); +} + +/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. + * + * If a zone is deemed to be full of pinned pages then just give it a light + * scan then give up on it.
+ */ +static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) +{ + struct zoneref *z; + struct zone *zone; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; + gfp_t orig_mask; + pg_data_t *last_pgdat = NULL; + pg_data_t *first_pgdat = NULL; + + /* + * If the number of buffer_heads in the machine exceeds the maximum + * allowed level, force direct reclaim to scan the highmem zone as + * highmem pages could be pinning lowmem pages storing buffer_heads + */ + orig_mask = sc->gfp_mask; + if (buffer_heads_over_limit) { + sc->gfp_mask |= __GFP_HIGHMEM; + sc->reclaim_idx = gfp_zone(sc->gfp_mask); + } + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + sc->reclaim_idx, sc->nodemask) { + /* + * Take care memory controller reclaiming has small influence + * to global LRU. + */ + if (!cgroup_reclaim(sc)) { + if (!cpuset_zone_allowed(zone, + GFP_KERNEL | __GFP_HARDWALL)) + continue; + + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticeable problem, like transparent huge + * page allocations. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && + sc->order > PAGE_ALLOC_COSTLY_ORDER && + compaction_ready(zone, sc)) { + sc->compaction_ready = true; + continue; + } + + /* + * Shrink each node in the zonelist once. If the + * zonelist is ordered by zone (not the default) then a + * node may be shrunk multiple times but in that case + * the user prefers lower zones being preserved. + */ + if (zone->zone_pgdat == last_pgdat) + continue; + + /* + * This steals pages from memory cgroups over softlimit + * and returns the number of reclaimed pages and + * scanned pages. This works for global memory pressure + * and balancing, not for a memcg's limit. 
+ */ + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, + sc->order, sc->gfp_mask, + &nr_soft_scanned); + sc->nr_reclaimed += nr_soft_reclaimed; + sc->nr_scanned += nr_soft_scanned; + /* need some check for avoid more shrink_zone() */ + } + + if (!first_pgdat) + first_pgdat = zone->zone_pgdat; + + /* See comment about same check for global reclaim above */ + if (zone->zone_pgdat == last_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + shrink_node(zone->zone_pgdat, sc); + } + + if (first_pgdat) + consider_reclaim_throttle(first_pgdat, sc); + + /* + * Restore to original mask to avoid the impact on the caller if we + * promoted it to __GFP_HIGHMEM. + */ + sc->gfp_mask = orig_mask; +} + +static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) +{ + struct lruvec *target_lruvec; + unsigned long refaults; + + if (lru_gen_enabled()) + return; + + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); + target_lruvec->refaults[WORKINGSET_ANON] = refaults; + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); + target_lruvec->refaults[WORKINGSET_FILE] = refaults; +} + +/* + * This is the main entry point to direct page reclaim. + * + * If a full scan of the inactive list fails to free enough memory then we + * are "out of memory" and something needs to be killed. + * + * If the caller is !__GFP_FS then the probability of a failure is reasonably + * high - the zone may be full of dirty or under-writeback pages, which this + * caller can't do much about. We kick the writeback threads and take explicit + * naps in the hope that some of these pages can be written. But if the + * allocating task holds filesystem locks which prevent writeout this might not + * work, and the allocation attempt will fail. 
+ * + * returns: the number of pages reclaimed, if any were reclaimed + * 1, if none were reclaimed but reclaim was aborted because + * compaction is ready (so the caller does not OOM) + * 0, otherwise + */ +static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + struct scan_control *sc) +{ + int initial_priority = sc->priority; + pg_data_t *last_pgdat; + struct zoneref *z; + struct zone *zone; +retry: + delayacct_freepages_start(); + + if (!cgroup_reclaim(sc)) + __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); + + do { + if (!sc->proactive) + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, + sc->priority); + sc->nr_scanned = 0; + shrink_zones(zonelist, sc); + + if (sc->nr_reclaimed >= sc->nr_to_reclaim) + break; + + if (sc->compaction_ready) + break; + + /* + * If we're having trouble reclaiming, start doing + * writepage even in laptop mode. + */ + if (sc->priority < DEF_PRIORITY - 2) + sc->may_writepage = 1; + } while (--sc->priority >= 0); + + last_pgdat = NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, + sc->nodemask) { + if (zone->zone_pgdat == last_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + + snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); + + if (cgroup_reclaim(sc)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, + zone->zone_pgdat); + clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + } + } + + delayacct_freepages_end(); + + if (sc->nr_reclaimed) + return sc->nr_reclaimed; + + /* Aborted reclaim to try compaction? don't OOM, then */ + if (sc->compaction_ready) + return 1; + + /* + * We make inactive:active ratio decisions based on the node's + * composition of memory, but a restrictive reclaim_idx or a + * memory.low cgroup setting can exempt large amounts of + * memory from reclaim. Neither of which are very common, so + * instead of doing costly eligibility calculations of the + * entire cgroup subtree up front, we assume the estimates are + * good, and retry with forcible deactivation if that fails.
+ */ + if (sc->skipped_deactivate) { + sc->priority = initial_priority; + sc->force_deactivate = 1; + sc->skipped_deactivate = 0; + goto retry; + } + + /* Untapped cgroup reserves? Don't OOM, retry. */ + if (sc->memcg_low_skipped) { + sc->priority = initial_priority; + sc->force_deactivate = 0; + sc->memcg_low_reclaim = 1; + sc->memcg_low_skipped = 0; + goto retry; + } + + return 0; +} + +static bool allow_direct_reclaim(pg_data_t *pgdat) +{ + struct zone *zone; + unsigned long pfmemalloc_reserve = 0; + unsigned long free_pages = 0; + int i; + bool wmark_ok; + + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + + for (i = 0; i <= ZONE_NORMAL; i++) { + zone = &pgdat->node_zones[i]; + if (!managed_zone(zone)) + continue; + + if (!zone_reclaimable_pages(zone)) + continue; + + pfmemalloc_reserve += min_wmark_pages(zone); + free_pages += zone_page_state(zone, NR_FREE_PAGES); + } + + /* If there are no reserves (unexpected config) then do not throttle */ + if (!pfmemalloc_reserve) + return true; + + wmark_ok = free_pages > pfmemalloc_reserve / 2; + + /* kswapd must be awake if processes are being throttled */ + if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { + if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL); + + wake_up_interruptible(&pgdat->kswapd_wait); + } + + return wmark_ok; +} + +/* + * Throttle direct reclaimers if backing storage is backed by the network + * and the PFMEMALLOC reserve for the preferred node is getting dangerously + * depleted. kswapd will continue to make progress and wake the processes + * when the low watermark is reached. + * + * Returns true if a fatal signal was delivered during throttling. If this + * happens, the page allocator should not consider triggering the OOM killer. 
+ */ +static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, + nodemask_t *nodemask) +{ + struct zoneref *z; + struct zone *zone; + pg_data_t *pgdat = NULL; + + /* + * Kernel threads should not be throttled as they may be indirectly + * responsible for cleaning pages necessary for reclaim to make forward + * progress. kjournald for example may enter direct reclaim while + * committing a transaction where throttling it could force other + * processes to block on log_wait_commit(). + */ + if (current->flags & PF_KTHREAD) + goto out; + + /* + * If a fatal signal is pending, this process should not throttle. + * It should return quickly so it can exit and free its memory + */ + if (fatal_signal_pending(current)) + goto out; + + /* + * Check if the pfmemalloc reserves are ok by finding the first node + * with a usable ZONE_NORMAL or lower zone. The expectation is that + * GFP_KERNEL will be required for allocating network buffers when + * swapping over the network so ZONE_HIGHMEM is unusable. + * + * Throttling is based on the first usable node and throttled processes + * wait on a queue until kswapd makes progress and wakes them. There + * is an affinity then between processes waking up and where reclaim + * progress has been made assuming the process wakes on the same node. + * More importantly, processes running on remote nodes will not compete + * for remote pfmemalloc reserves and processes on different nodes + * should make reasonable progress.
+ */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(gfp_mask), nodemask) { + if (zone_idx(zone) > ZONE_NORMAL) + continue; + + /* Throttle based on the first usable node */ + pgdat = zone->zone_pgdat; + if (allow_direct_reclaim(pgdat)) + goto out; + break; + } + + /* If no zone was usable by the allocation flags then do not throttle */ + if (!pgdat) + goto out; + + /* Account for the throttling */ + count_vm_event(PGSCAN_DIRECT_THROTTLE); + + /* + * If the caller cannot enter the filesystem, it's possible that it + * is due to the caller holding an FS lock or performing a journal + * transaction in the case of a filesystem like ext[3|4]. In this case, + * it is not safe to block on pfmemalloc_wait as kswapd could be + * blocked waiting on the same lock. Instead, throttle for up to a + * second before continuing. + */ + if (!(gfp_mask & __GFP_FS)) + wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, + allow_direct_reclaim(pgdat), HZ); + else + /* Throttle until kswapd wakes the process */ + wait_event_killable(pgdat->pfmemalloc_wait, + allow_direct_reclaim(pgdat)); + + if (fatal_signal_pending(current)) + return true; + +out: + return false; +} + +unsigned long try_to_free_pages(struct zonelist *zonelist, int order, + gfp_t gfp_mask, nodemask_t *nodemask) +{ + unsigned long nr_reclaimed; + struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, + .gfp_mask = current_gfp_context(gfp_mask), + .reclaim_idx = gfp_zone(gfp_mask), + .order = order, + .nodemask = nodemask, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = 1, + }; + + /* + * scan_control uses s8 fields for order, priority, and reclaim_idx. + * Confirm they are large enough for max values. + */ + BUILD_BUG_ON(MAX_ORDER > S8_MAX); + BUILD_BUG_ON(DEF_PRIORITY > S8_MAX); + BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX); + + /* + * Do not enter reclaim if fatal signal was delivered while throttled.
+ * 1 is returned so that the page allocator does not OOM kill at this + * point. + */ + if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) + return 1; + + set_task_reclaim_state(current, &sc.reclaim_state); + trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + set_task_reclaim_state(current, NULL); + + return nr_reclaimed; +} + +#ifdef CONFIG_MEMCG + +/* Only used by soft limit reclaim. Do not reuse for anything else. */ +unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, + gfp_t gfp_mask, bool noswap, + pg_data_t *pgdat, + unsigned long *nr_scanned) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, + .target_mem_cgroup = memcg, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, + .may_swap = !noswap, + }; + + WARN_ON_ONCE(!current->reclaim_state); + + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + + trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, + sc.gfp_mask); + + /* + * NOTE: Although we can get the priority field, using it + * here is not a good idea, since it limits the pages we can scan. + * If we don't reclaim here, the shrink_node from balance_pgdat + * will pick up pages from other mem cgroups as well. We hack + * the priority and make it zero.
+ */ + shrink_lruvec(lruvec, &sc); + + trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + + *nr_scanned = sc.nr_scanned; + + return sc.nr_reclaimed; +} + +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, + gfp_t gfp_mask, + unsigned int reclaim_options) +{ + unsigned long nr_reclaimed; + unsigned int noreclaim_flag; + struct scan_control sc = { + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .reclaim_idx = MAX_NR_ZONES - 1, + .target_mem_cgroup = memcg, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), + .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), + }; + /* + * Traverse the ZONELIST_FALLBACK zonelist of the current node to put + * equal pressure on all the nodes. This is based on the assumption that + * the reclaim does not bail out early. 
+ */ + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + + set_task_reclaim_state(current, &sc.reclaim_state); + trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + memalloc_noreclaim_restore(noreclaim_flag); + trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + set_task_reclaim_state(current, NULL); + + return nr_reclaimed; +} +#endif + +static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + struct mem_cgroup *memcg; + struct lruvec *lruvec; + + if (lru_gen_enabled()) { + lru_gen_age_node(pgdat, sc); + return; + } + + if (!can_age_anon_pages(pgdat, sc)) + return; + + lruvec = mem_cgroup_lruvec(NULL, pgdat); + if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) + return; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + lruvec = mem_cgroup_lruvec(memcg, pgdat); + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); + memcg = mem_cgroup_iter(NULL, memcg, NULL); + } while (memcg); +} + +static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx) +{ + int i; + struct zone *zone; + + /* + * Check for watermark boosts top-down as the higher zones + * are more likely to be boosted. Both watermarks and boosts + * should not be checked at the same time as reclaim would + * start prematurely when there is no boosting and a lower + * zone is balanced. + */ + for (i = highest_zoneidx; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!managed_zone(zone)) + continue; + + if (zone->watermark_boost) + return true; + } + + return false; +} + +/* + * Returns true if there is an eligible zone balanced for the request order + * and highest_zoneidx + */ +static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) +{ + int i; + unsigned long mark = -1; + struct zone *zone; + + /* + * Check watermarks bottom-up as lower zones are more likely to + * meet watermarks. 
+ */ + for (i = 0; i <= highest_zoneidx; i++) { + zone = pgdat->node_zones + i; + + if (!managed_zone(zone)) + continue; + + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) + mark = wmark_pages(zone, WMARK_PROMO); + else + mark = high_wmark_pages(zone); + if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx)) + return true; + } + + /* + * If a node has no managed zone within highest_zoneidx, it does not + * need balancing by definition. This can happen if a zone-restricted + * allocation tries to wake a remote kswapd. + */ + if (mark == -1) + return true; + + return false; +} + +/* Clear pgdat state for congested, dirty or under writeback. */ +static void clear_pgdat_congested(pg_data_t *pgdat) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); + + clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + clear_bit(PGDAT_DIRTY, &pgdat->flags); + clear_bit(PGDAT_WRITEBACK, &pgdat->flags); +} + +/* + * Prepare kswapd for sleeping. This verifies that there are no processes + * waiting in throttle_direct_reclaim() and that watermarks have been met. + * + * Returns true if kswapd is ready to sleep + */ +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, + int highest_zoneidx) +{ + /* + * The throttled processes are normally woken up in balance_pgdat() as + * soon as allow_direct_reclaim() is true. But there is a potential + * race between when kswapd checks the watermarks and a process gets + * throttled. There is also a potential race if processes get + * throttled, kswapd wakes, a large process exits thereby balancing the + * zones, which causes kswapd to exit balance_pgdat() before reaching + * the wake up checks. If kswapd is going to sleep, no process should + * be sleeping on pfmemalloc_wait, so wake them now if necessary. If + * the wake up is premature, processes will wake kswapd and get + * throttled again. The difference from wake ups in balance_pgdat() is + * that here we are under prepare_to_wait(). 
+ */ + if (waitqueue_active(&pgdat->pfmemalloc_wait)) + wake_up_all(&pgdat->pfmemalloc_wait); + + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + + if (pgdat_balanced(pgdat, order, highest_zoneidx)) { + clear_pgdat_congested(pgdat); + return true; + } + + return false; +} + +/* + * kswapd shrinks a node of pages that are at or below the highest usable + * zone that is currently unbalanced. + * + * Returns true if kswapd scanned at least the requested number of pages to + * reclaim or if the lack of progress was due to pages under writeback. + * This is used to determine if the scanning priority needs to be raised. + */ +static bool kswapd_shrink_node(pg_data_t *pgdat, + struct scan_control *sc) +{ + struct zone *zone; + int z; + + /* Reclaim a number of pages proportional to the number of zones */ + sc->nr_to_reclaim = 0; + for (z = 0; z <= sc->reclaim_idx; z++) { + zone = pgdat->node_zones + z; + if (!managed_zone(zone)) + continue; + + sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); + } + + /* + * Historically care was taken to put equal pressure on all zones but + * now pressure is applied based on node LRU order. + */ + shrink_node(pgdat, sc); + + /* + * Fragmentation may mean that the system cannot be rebalanced for + * high-order allocations. If twice the allocation size has been + * reclaimed then recheck watermarks only at order-0 to prevent + * excessive reclaim. Assume that a process requested a high-order + * can direct reclaim/compact. + */ + if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order)) + sc->order = 0; + + return sc->nr_scanned >= sc->nr_to_reclaim; +} + +/* Page allocator PCP high watermark is lowered if reclaim is active. 
*/ +static inline void +update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) +{ + int i; + struct zone *zone; + + for (i = 0; i <= highest_zoneidx; i++) { + zone = pgdat->node_zones + i; + + if (!managed_zone(zone)) + continue; + + if (active) + set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); + else + clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); + } +} + +static inline void +set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) +{ + update_reclaim_active(pgdat, highest_zoneidx, true); +} + +static inline void +clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx) +{ + update_reclaim_active(pgdat, highest_zoneidx, false); +} + +/* + * For kswapd, balance_pgdat() will reclaim pages across a node from zones + * that are eligible for use by the caller until at least one zone is + * balanced. + * + * Returns the order kswapd finished reclaiming at. + * + * kswapd scans the zones in the highmem->normal->dma direction. It skips + * zones which have free_pages > high_wmark_pages(zone), but once a zone is + * found to have free_pages <= high_wmark_pages(zone), any page in that zone + * or lower is eligible for reclaim until at least one usable zone is + * balanced. + */ +static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) +{ + int i; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; + unsigned long pflags; + unsigned long nr_boost_reclaim; + unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; + bool boosted; + struct zone *zone; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .order = order, + .may_unmap = 1, + }; + + set_task_reclaim_state(current, &sc.reclaim_state); + psi_memstall_enter(&pflags); + __fs_reclaim_acquire(_THIS_IP_); + + count_vm_event(PAGEOUTRUN); + + /* + * Account for the reclaim boost. Note that the zone boost is left in + * place so that parallel allocations that are near the watermark will + * stall or direct reclaim until kswapd is finished. 
+ */ + nr_boost_reclaim = 0; + for (i = 0; i <= highest_zoneidx; i++) { + zone = pgdat->node_zones + i; + if (!managed_zone(zone)) + continue; + + nr_boost_reclaim += zone->watermark_boost; + zone_boosts[i] = zone->watermark_boost; + } + boosted = nr_boost_reclaim; + +restart: + set_reclaim_active(pgdat, highest_zoneidx); + sc.priority = DEF_PRIORITY; + do { + unsigned long nr_reclaimed = sc.nr_reclaimed; + bool raise_priority = true; + bool balanced; + bool ret; + + sc.reclaim_idx = highest_zoneidx; + + /* + * If the number of buffer_heads exceeds the maximum allowed + * then consider reclaiming from all zones. This has a dual + * purpose -- on 64-bit systems it is expected that + * buffer_heads are stripped during active rotation. On 32-bit + * systems, highmem pages can pin lowmem memory and shrinking + * buffers can relieve lowmem pressure. Reclaim may still not + * go ahead if all eligible zones for the original allocation + * request are balanced to avoid excessive reclaim from kswapd. + */ + if (buffer_heads_over_limit) { + for (i = MAX_NR_ZONES - 1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!managed_zone(zone)) + continue; + + sc.reclaim_idx = i; + break; + } + } + + /* + * If the pgdat is imbalanced then ignore boosting and preserve + * the watermarks for a later time and restart. Note that the + * zone watermarks will be still reset at the end of balancing + * on the grounds that the normal reclaim should be enough to + * re-evaluate if boosting is required when kswapd next wakes. + */ + balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx); + if (!balanced && nr_boost_reclaim) { + nr_boost_reclaim = 0; + goto restart; + } + + /* + * If boosting is not active then only reclaim if there are no + * eligible zones. Note that sc.reclaim_idx is not used as + * buffer_heads_over_limit may have adjusted it. 
+ */ + if (!nr_boost_reclaim && balanced) + goto out; + + /* Limit the priority of boosting to avoid reclaim writeback */ + if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) + raise_priority = false; + + /* + * Do not writeback or swap pages for boosted reclaim. The + * intent is to relieve pressure not issue sub-optimal IO + * from reclaim context. If no pages are reclaimed, the + * reclaim will be aborted. + */ + sc.may_writepage = !laptop_mode && !nr_boost_reclaim; + sc.may_swap = !nr_boost_reclaim; + + /* + * Do some background aging, to give pages a chance to be + * referenced before reclaiming. All pages are rotated + * regardless of classzone as this is about consistent aging. + */ + kswapd_age_node(pgdat, &sc); + + /* + * If we're getting trouble reclaiming, start doing writepage + * even in laptop mode. + */ + if (sc.priority < DEF_PRIORITY - 2) + sc.may_writepage = 1; + + /* Call soft limit reclaim before calling shrink_node. */ + sc.nr_scanned = 0; + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order, + sc.gfp_mask, &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; + + /* + * There should be no need to raise the scanning priority if + * enough pages are already being scanned that that high + * watermark would be met at 100% efficiency. + */ + if (kswapd_shrink_node(pgdat, &sc)) + raise_priority = false; + + /* + * If the low watermark is met there is no need for processes + * to be throttled on pfmemalloc_wait as they should not be + * able to safely make forward progress. 
Wake them + */ + if (waitqueue_active(&pgdat->pfmemalloc_wait) && + allow_direct_reclaim(pgdat)) + wake_up_all(&pgdat->pfmemalloc_wait); + + /* Check if kswapd should be suspending */ + __fs_reclaim_release(_THIS_IP_); + ret = try_to_freeze(); + __fs_reclaim_acquire(_THIS_IP_); + if (ret || kthread_should_stop()) + break; + + /* + * Raise priority if scanning rate is too low or there was no + * progress in reclaiming pages + */ + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); + + /* + * If reclaim made no progress for a boost, stop reclaim as + * IO cannot be queued and it could be an infinite loop in + * extreme circumstances. + */ + if (nr_boost_reclaim && !nr_reclaimed) + break; + + if (raise_priority || !nr_reclaimed) + sc.priority--; + } while (sc.priority >= 1); + + if (!sc.nr_reclaimed) + pgdat->kswapd_failures++; + +out: + clear_reclaim_active(pgdat, highest_zoneidx); + + /* If reclaim was boosted, account for the reclaim done in this pass */ + if (boosted) { + unsigned long flags; + + for (i = 0; i <= highest_zoneidx; i++) { + if (!zone_boosts[i]) + continue; + + /* Increments are under the zone lock */ + zone = pgdat->node_zones + i; + spin_lock_irqsave(&zone->lock, flags); + zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); + spin_unlock_irqrestore(&zone->lock, flags); + } + + /* + * As there is now likely space, wakeup kcompact to defragment + * pageblocks. + */ + wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx); + } + + snapshot_refaults(NULL, pgdat); + __fs_reclaim_release(_THIS_IP_); + psi_memstall_leave(&pflags); + set_task_reclaim_state(current, NULL); + + /* + * Return the order kswapd stopped reclaiming at as + * prepare_kswapd_sleep() takes it into account. If another caller + * entered the allocator slow path while kswapd was awake, order will + * remain at the higher level. 
+ */ + return sc.order; +} + +/* + * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to + * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is + * not a valid index then either kswapd runs for first time or kswapd couldn't + * sleep after previous reclaim attempt (node is still unbalanced). In that + * case return the zone index of the previous kswapd reclaim cycle. + */ +static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat, + enum zone_type prev_highest_zoneidx) +{ + enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); + + return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx; +} + +static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, + unsigned int highest_zoneidx) +{ + long remaining = 0; + DEFINE_WAIT(wait); + + if (freezing(current) || kthread_should_stop()) + return; + + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + + /* + * Try to sleep for a short interval. Note that kcompactd will only be + * woken if it is possible to sleep for a short interval. This is + * deliberate on the assumption that if reclaim cannot keep an + * eligible zone balanced that it's also unlikely that compaction will + * succeed. + */ + if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { + /* + * Compaction records what page blocks it recently failed to + * isolate pages from and skips them in the future scanning. + * When kswapd is going to sleep, it is reasonable to assume + * that pages and compaction may succeed so reset the cache. + */ + reset_isolation_suitable(pgdat); + + /* + * We have freed the memory, now we should compact it to make + * allocation of the requested order possible. + */ + wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx); + + remaining = schedule_timeout(HZ/10); + + /* + * If woken prematurely then reset kswapd_highest_zoneidx and + * order. 
The values will either be from a wakeup request or + * the previous request that slept prematurely. + */ + if (remaining) { + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, + kswapd_highest_zoneidx(pgdat, + highest_zoneidx)); + + if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) + WRITE_ONCE(pgdat->kswapd_order, reclaim_order); + } + + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + } + + /* + * After a short sleep, check if it was a premature sleep. If not, then + * go fully to sleep until explicitly woken up. + */ + if (!remaining && + prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) { + trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + + /* + * vmstat counters are not perfectly accurate and the estimated + * value for counters such as NR_FREE_PAGES can deviate from the + * true value by nr_online_cpus * threshold. To avoid the zone + * watermarks being breached while under pressure, we reduce the + * per-cpu vmstat threshold while kswapd is awake and restore + * them before going back to sleep. + */ + set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); + + if (!kthread_should_stop()) + schedule(); + + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); + } else { + if (remaining) + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); + else + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); + } + finish_wait(&pgdat->kswapd_wait, &wait); +} + +/* + * The background pageout daemon, started as a kernel thread + * from the init process. + * + * This basically trickles out pages so that we have _some_ + * free memory available even if there is no other activity + * that frees anything up. This is needed for things like routing + * etc, where we otherwise might have all activity going on in + * asynchronous contexts that cannot page things out. + * + * If there are applications that are active memory-allocators + * (most normal use), this basically shouldn't matter. 
+ */ +static int kswapd(void *p) +{ + unsigned int alloc_order, reclaim_order; + unsigned int highest_zoneidx = MAX_NR_ZONES - 1; + pg_data_t *pgdat = (pg_data_t *)p; + struct task_struct *tsk = current; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + /* + * Tell the memory management that we're a "memory allocator", + * and that if we need more memory we should get access to it + * regardless (see "__alloc_pages()"). "kswapd" should + * never get caught in the normal page freeing logic. + * + * (Kswapd normally doesn't need memory anyway, but sometimes + * you need a small amount of memory in order to be able to + * page out something else, and this flag essentially protects + * us from recursively trying to free more memory as we're + * trying to free the first piece of memory in the first place). + */ + tsk->flags |= PF_MEMALLOC | PF_KSWAPD; + set_freezable(); + + WRITE_ONCE(pgdat->kswapd_order, 0); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); + atomic_set(&pgdat->nr_writeback_throttled, 0); + for ( ; ; ) { + bool ret; + + alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); + +kswapd_try_sleep: + kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, + highest_zoneidx); + + /* Read the new order and highest_zoneidx */ + alloc_order = READ_ONCE(pgdat->kswapd_order); + highest_zoneidx = kswapd_highest_zoneidx(pgdat, + highest_zoneidx); + WRITE_ONCE(pgdat->kswapd_order, 0); + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES); + + ret = try_to_freeze(); + if (kthread_should_stop()) + break; + + /* + * We can speed up thawing tasks if we don't call balance_pgdat + * after returning from the refrigerator + */ + if (ret) + continue; + + /* + * Reclaim begins at the requested order but if a high-order + * reclaim fails then kswapd falls back to reclaiming for + * order-0. 
If that happens, kswapd will consider sleeping + * for the order it finished reclaiming at (reclaim_order) + * but kcompactd is woken to compact for the original + * request (alloc_order). + */ + trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, + alloc_order); + reclaim_order = balance_pgdat(pgdat, alloc_order, + highest_zoneidx); + if (reclaim_order < alloc_order) + goto kswapd_try_sleep; + } + + tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD); + + return 0; +} + +/* + * A zone is low on free memory or too fragmented for high-order memory. If + * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's + * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim + * has failed or is not needed, still wake up kcompactd if only compaction is + * needed. + */ +void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, + enum zone_type highest_zoneidx) +{ + pg_data_t *pgdat; + enum zone_type curr_idx; + + if (!managed_zone(zone)) + return; + + if (!cpuset_zone_allowed(zone, gfp_flags)) + return; + + pgdat = zone->zone_pgdat; + curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx); + + if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx) + WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx); + + if (READ_ONCE(pgdat->kswapd_order) < order) + WRITE_ONCE(pgdat->kswapd_order, order); + + if (!waitqueue_active(&pgdat->kswapd_wait)) + return; + + /* Hopeless node, leave it to direct reclaim if possible */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || + (pgdat_balanced(pgdat, order, highest_zoneidx) && + !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { + /* + * There may be plenty of free memory available, but it's too + * fragmented for high-order allocations. Wake up kcompactd + * and rely on compaction_suitable() to determine if it's + * needed. If it fails, it will defer subsequent attempts to + * ratelimit its work. 
+ */ + if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) + wakeup_kcompactd(pgdat, order, highest_zoneidx); + return; + } + + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order, + gfp_flags); + wake_up_interruptible(&pgdat->kswapd_wait); +} + +#ifdef CONFIG_HIBERNATION +/* + * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of + * freed pages. + * + * Rather than trying to age LRUs the aim is to preserve the overall + * LRU order by reclaiming preferentially + * inactive > active > active referenced > active mapped + */ +unsigned long shrink_all_memory(unsigned long nr_to_reclaim) +{ + struct scan_control sc = { + .nr_to_reclaim = nr_to_reclaim, + .gfp_mask = GFP_HIGHUSER_MOVABLE, + .reclaim_idx = MAX_NR_ZONES - 1, + .priority = DEF_PRIORITY, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + .hibernation_mode = 1, + }; + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + unsigned long nr_reclaimed; + unsigned int noreclaim_flag; + + fs_reclaim_acquire(sc.gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); + set_task_reclaim_state(current, &sc.reclaim_state); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + set_task_reclaim_state(current, NULL); + memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); + + return nr_reclaimed; +} +#endif /* CONFIG_HIBERNATION */ + +/* + * This kswapd start function will be called by init and node-hot-add. + */ +void kswapd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + + pgdat_kswapd_lock(pgdat); + if (!pgdat->kswapd) { + pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + if (IS_ERR(pgdat->kswapd)) { + /* failure at boot is fatal */ + BUG_ON(system_state < SYSTEM_RUNNING); + pr_err("Failed to start kswapd on node %d\n", nid); + pgdat->kswapd = NULL; + } + } + pgdat_kswapd_unlock(pgdat); +} + +/* + * Called by memory hotplug when all memory in a node is offlined. 
Caller must + * be holding mem_hotplug_begin/done(). + */ +void kswapd_stop(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + struct task_struct *kswapd; + + pgdat_kswapd_lock(pgdat); + kswapd = pgdat->kswapd; + if (kswapd) { + kthread_stop(kswapd); + pgdat->kswapd = NULL; + } + pgdat_kswapd_unlock(pgdat); +} + +static int __init kswapd_init(void) +{ + int nid; + + swap_setup(); + for_each_node_state(nid, N_MEMORY) + kswapd_run(nid); + return 0; +} + +module_init(kswapd_init) + +#ifdef CONFIG_NUMA +/* + * Node reclaim mode + * + * If non-zero call node_reclaim when the number of free pages falls below + * the watermarks. + */ +int node_reclaim_mode __read_mostly; + +/* + * Priority for NODE_RECLAIM. This determines the fraction of pages + * of a node considered for each zone_reclaim. 4 scans 1/16th of + * a zone. + */ +#define NODE_RECLAIM_PRIORITY 4 + +/* + * Percentage of pages in a zone that must be unmapped for node_reclaim to + * occur. + */ +int sysctl_min_unmapped_ratio = 1; + +/* + * If the number of slab pages in a zone grows beyond this percentage then + * slab reclaim needs to occur. + */ +int sysctl_min_slab_ratio = 5; + +static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) +{ + unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED); + unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) + + node_page_state(pgdat, NR_ACTIVE_FILE); + + /* + * It's possible for there to be more file mapped pages than + * accounted for by the pages on the file LRU lists because + * tmpfs pages accounted for as ANON can also be FILE_MAPPED + */ + return (file_lru > file_mapped) ? 
(file_lru - file_mapped) : 0; +} + +/* Work out how many page cache pages we can reclaim in this reclaim_mode */ +static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) +{ + unsigned long nr_pagecache_reclaimable; + unsigned long delta = 0; + + /* + * If RECLAIM_UNMAP is set, then all file pages are considered + * potentially reclaimable. Otherwise, we have to worry about + * pages like swapcache and node_unmapped_file_pages() provides + * a better estimate + */ + if (node_reclaim_mode & RECLAIM_UNMAP) + nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); + else + nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); + + /* If we can't clean pages, remove dirty pages from consideration */ + if (!(node_reclaim_mode & RECLAIM_WRITE)) + delta += node_page_state(pgdat, NR_FILE_DIRTY); + + /* Watch for any possible underflows due to delta */ + if (unlikely(delta > nr_pagecache_reclaimable)) + delta = nr_pagecache_reclaimable; + + return nr_pagecache_reclaimable - delta; +} + +/* + * Try to free up some pages from this node through reclaim. 
+ */ +static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) +{ + /* Minimum pages needed in order to stay on node */ + const unsigned long nr_pages = 1 << order; + struct task_struct *p = current; + unsigned int noreclaim_flag; + struct scan_control sc = { + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), + .gfp_mask = current_gfp_context(gfp_mask), + .order = order, + .priority = NODE_RECLAIM_PRIORITY, + .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), + .may_swap = 1, + .reclaim_idx = gfp_zone(gfp_mask), + }; + unsigned long pflags; + + trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, + sc.gfp_mask); + + cond_resched(); + psi_memstall_enter(&pflags); + fs_reclaim_acquire(sc.gfp_mask); + /* + * We need to be able to allocate from the reserves for RECLAIM_UNMAP + */ + noreclaim_flag = memalloc_noreclaim_save(); + set_task_reclaim_state(p, &sc.reclaim_state); + + if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages || + node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) { + /* + * Free memory by calling shrink node with increasing + * priorities until we have enough memory freed. + */ + do { + shrink_node(pgdat, &sc); + } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); + } + + set_task_reclaim_state(p, NULL); + memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); + psi_memstall_leave(&pflags); + + trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); + + return sc.nr_reclaimed >= nr_pages; +} + +int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) +{ + int ret; + + /* + * Node reclaim reclaims unmapped file backed pages and + * slab pages if we are over the defined limits. + * + * A small portion of unmapped file backed pages is needed for + * file I/O otherwise pages read by file I/O will be immediately + * thrown out if the node is overallocated. 
So we do not reclaim + * if less than a specified percentage of the node is used by + * unmapped file backed pages. + */ + if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && + node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <= + pgdat->min_slab_pages) + return NODE_RECLAIM_FULL; + + /* + * Do not scan if the allocation should not be delayed. + */ + if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) + return NODE_RECLAIM_NOSCAN; + + /* + * Only run node reclaim on the local node or on nodes that do not + * have associated processors. This will favor the local processor + * over remote processors and spread off node memory allocations + * as wide as possible. + */ + if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) + return NODE_RECLAIM_NOSCAN; + + if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) + return NODE_RECLAIM_NOSCAN; + + ret = __node_reclaim(pgdat, gfp_mask, order); + clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); + + if (!ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); + + return ret; +} +#endif + +void check_move_unevictable_pages(struct pagevec *pvec) +{ + struct folio_batch fbatch; + unsigned i; + + folio_batch_init(&fbatch); + for (i = 0; i < pvec->nr; i++) { + struct page *page = pvec->pages[i]; + + if (PageTransTail(page)) + continue; + folio_batch_add(&fbatch, page_folio(page)); + } + check_move_unevictable_folios(&fbatch); +} +EXPORT_SYMBOL_GPL(check_move_unevictable_pages); + +/** + * check_move_unevictable_folios - Move evictable folios to appropriate zone + * lru list + * @fbatch: Batch of lru folios to check. + * + * Checks folios for evictability, if an evictable folio is in the unevictable + * lru list, moves it to the appropriate evictable lru list. This function + * should be only used for lru folios. 
+ */ +void check_move_unevictable_folios(struct folio_batch *fbatch) +{ + struct lruvec *lruvec = NULL; + int pgscanned = 0; + int pgrescued = 0; + int i; + + for (i = 0; i < fbatch->nr; i++) { + struct folio *folio = fbatch->folios[i]; + int nr_pages = folio_nr_pages(folio); + + pgscanned += nr_pages; + + /* block memcg migration while the folio moves between lrus */ + if (!folio_test_clear_lru(folio)) + continue; + + lruvec = folio_lruvec_relock_irq(folio, lruvec); + if (folio_evictable(folio) && folio_test_unevictable(folio)) { + lruvec_del_folio(lruvec, folio); + folio_clear_unevictable(folio); + lruvec_add_folio(lruvec, folio); + pgrescued += nr_pages; + } + folio_set_lru(folio); + } + + if (lruvec) { + __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); + __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); + unlock_page_lruvec_irq(lruvec); + } else if (pgscanned) { + count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); + } +} +EXPORT_SYMBOL_GPL(check_move_unevictable_folios); diff --git a/mm/vmstat.c b/mm/vmstat.c new file mode 100644 index 000000000..b2371d745 --- /dev/null +++ b/mm/vmstat.c @@ -0,0 +1,2251 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/mm/vmstat.c + * + * Manages VM statistics + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * zoned VM statistics + * Copyright (C) 2006 Silicon Graphics, Inc., + * Christoph Lameter + * Copyright (C) 2008-2014 Christoph Lameter + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#ifdef CONFIG_NUMA +int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; + +/* zero numa counters within a zone */ +static void zero_zone_numa_counters(struct zone *zone) +{ + int item, cpu; + + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) { + atomic_long_set(&zone->vm_numa_event[item], 0); + for_each_online_cpu(cpu) { + 
per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_event[item] + = 0; + } + } +} + +/* zero numa counters of all the populated zones */ +static void zero_zones_numa_counters(void) +{ + struct zone *zone; + + for_each_populated_zone(zone) + zero_zone_numa_counters(zone); +} + +/* zero global numa counters */ +static void zero_global_numa_counters(void) +{ + int item; + + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + atomic_long_set(&vm_numa_event[item], 0); +} + +static void invalid_numa_statistics(void) +{ + zero_zones_numa_counters(); + zero_global_numa_counters(); +} + +static DEFINE_MUTEX(vm_numa_stat_lock); + +int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos) +{ + int ret, oldval; + + mutex_lock(&vm_numa_stat_lock); + if (write) + oldval = sysctl_vm_numa_stat; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret || !write) + goto out; + + if (oldval == sysctl_vm_numa_stat) + goto out; + else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) { + static_branch_enable(&vm_numa_stat_key); + pr_info("enable numa statistics\n"); + } else { + static_branch_disable(&vm_numa_stat_key); + invalid_numa_statistics(); + pr_info("disable numa statistics, and clear numa counters\n"); + } + +out: + mutex_unlock(&vm_numa_stat_lock); + return ret; +} +#endif + +#ifdef CONFIG_VM_EVENT_COUNTERS +DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; +EXPORT_PER_CPU_SYMBOL(vm_event_states); + +static void sum_vm_events(unsigned long *ret) +{ + int cpu; + int i; + + memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); + + for_each_online_cpu(cpu) { + struct vm_event_state *this = &per_cpu(vm_event_states, cpu); + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + ret[i] += this->event[i]; + } +} + +/* + * Accumulate the vm event counters across all CPUs. + * The result is unavoidably approximate - it can change + * during and after execution of this function. 
+*/ +void all_vm_events(unsigned long *ret) +{ + cpus_read_lock(); + sum_vm_events(ret); + cpus_read_unlock(); +} +EXPORT_SYMBOL_GPL(all_vm_events); + +/* + * Fold the foreign cpu events into our own. + * + * This is adding to the events on one processor + * but keeps the global counts constant. + */ +void vm_events_fold_cpu(int cpu) +{ + struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu); + int i; + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { + count_vm_events(i, fold_state->event[i]); + fold_state->event[i] = 0; + } +} + +#endif /* CONFIG_VM_EVENT_COUNTERS */ + +/* + * Manage combined zone based / global counters + * + * vm_stat contains the global counters + */ +atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; +atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp; +atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp; +EXPORT_SYMBOL(vm_zone_stat); +EXPORT_SYMBOL(vm_node_stat); + +#ifdef CONFIG_NUMA +static void fold_vm_zone_numa_events(struct zone *zone) +{ + unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, }; + int cpu; + enum numa_stat_item item; + + for_each_online_cpu(cpu) { + struct per_cpu_zonestat *pzstats; + + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0); + } + + for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++) + zone_numa_event_add(zone_numa_events[item], zone, item); +} + +void fold_vm_numa_events(void) +{ + struct zone *zone; + + for_each_populated_zone(zone) + fold_vm_zone_numa_events(zone); +} +#endif + +#ifdef CONFIG_SMP + +int calculate_pressure_threshold(struct zone *zone) +{ + int threshold; + int watermark_distance; + + /* + * As vmstats are not up to date, there is drift between the estimated + * and real values. 
For high thresholds and a high number of CPUs, it + * is possible for the min watermark to be breached while the estimated + * value looks fine. The pressure threshold is a reduced value such + * that even the maximum amount of drift will not accidentally breach + * the min watermark + */ + watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); + threshold = max(1, (int)(watermark_distance / num_online_cpus())); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} + +int calculate_normal_threshold(struct zone *zone) +{ + int threshold; + int mem; /* memory in 128 MB units */ + + /* + * The threshold scales with the number of processors and the amount + * of memory per zone. More memory means that we can defer updates for + * longer, more processors could lead to more contention. + * fls() is used to have a cheap way of logarithmic scaling. + * + * Some sample thresholds: + * + * Threshold Processors (fls) Zonesize fls(mem)+1 + * ------------------------------------------------------------------ + * 8 1 1 0.9-1 GB 4 + * 16 2 2 0.9-1 GB 4 + * 20 2 2 1-2 GB 5 + * 24 2 2 2-4 GB 6 + * 28 2 2 4-8 GB 7 + * 32 2 2 8-16 GB 8 + * 4 2 2 <128M 1 + * 30 4 3 2-4 GB 5 + * 48 4 3 8-16 GB 8 + * 32 8 4 1-2 GB 4 + * 32 8 4 0.9-1GB 4 + * 10 16 5 <128M 1 + * 40 16 5 900M 4 + * 70 64 7 2-4 GB 5 + * 84 64 7 4-8 GB 6 + * 108 512 9 4-8 GB 6 + * 125 1024 10 8-16 GB 8 + * 125 1024 10 16-32 GB 9 + */ + + mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT); + + threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} + +/* + * Refresh the thresholds for each zone. 
+ */ +void refresh_zone_stat_thresholds(void) +{ + struct pglist_data *pgdat; + struct zone *zone; + int cpu; + int threshold; + + /* Zero current pgdat thresholds */ + for_each_online_pgdat(pgdat) { + for_each_online_cpu(cpu) { + per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0; + } + } + + for_each_populated_zone(zone) { + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long max_drift, tolerate_drift; + + threshold = calculate_normal_threshold(zone); + + for_each_online_cpu(cpu) { + int pgdat_threshold; + + per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold + = threshold; + + /* Base nodestat threshold on the largest populated zone. */ + pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold; + per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold + = max(threshold, pgdat_threshold); + } + + /* + * Only set percpu_drift_mark if there is a danger that + * NR_FREE_PAGES reports the low watermark is ok when in fact + * the min watermark could be breached by an allocation + */ + tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone); + max_drift = num_online_cpus() * threshold; + if (max_drift > tolerate_drift) + zone->percpu_drift_mark = high_wmark_pages(zone) + + max_drift; + } +} + +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = (*calculate_pressure)(zone); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->per_cpu_zonestats, cpu)->stat_threshold + = threshold; + } +} + +/* + * For use when we know that interrupts are disabled, + * or when we know that preemption is disabled and that + * particular counter cannot be updated from interrupt context. 
+ */ +void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + long delta) +{ + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; + s8 __percpu *p = pcp->vm_stat_diff + item; + long x; + long t; + + /* + * Accurate vmstat updates require a RMW. On !PREEMPT_RT kernels, + * atomicity is provided by IRQs being disabled -- either explicitly + * or via local_lock_irq. On PREEMPT_RT, local_lock_irq only disables + * CPU migrations and preemption potentially corrupts a counter so + * disable preemption. + */ + preempt_disable_nested(); + + x = delta + __this_cpu_read(*p); + + t = __this_cpu_read(pcp->stat_threshold); + + if (unlikely(abs(x) > t)) { + zone_page_state_add(x, zone, item); + x = 0; + } + __this_cpu_write(*p, x); + + preempt_enable_nested(); +} +EXPORT_SYMBOL(__mod_zone_page_state); + +void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, + long delta) +{ + struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; + s8 __percpu *p = pcp->vm_node_stat_diff + item; + long x; + long t; + + if (vmstat_item_in_bytes(item)) { + /* + * Only cgroups use subpage accounting right now; at + * the global level, these items still change in + * multiples of whole pages. Store them as pages + * internally to keep the per-cpu counters compact. + */ + VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); + delta >>= PAGE_SHIFT; + } + + /* See __mod_zone_page_state for why preemption is disabled here */ + preempt_disable_nested(); + + x = delta + __this_cpu_read(*p); + + t = __this_cpu_read(pcp->stat_threshold); + + if (unlikely(abs(x) > t)) { + node_page_state_add(x, pgdat, item); + x = 0; + } + __this_cpu_write(*p, x); + + preempt_enable_nested(); +} +EXPORT_SYMBOL(__mod_node_page_state); + +/* + * Optimized increment and decrement functions. + * + * These are only for a single page and therefore can take a struct page * + * argument instead of struct zone *.
This allows the inclusion of the code + * generated for page_zone(page) into the optimized functions. + * + * No overflow check is necessary and therefore the differential can be + * incremented or decremented in place which may allow the compilers to + * generate better code. + * The increment or decrement is known and therefore one boundary check can + * be omitted. + * + * NOTE: These functions are very performance sensitive. Change only + * with care. + * + * Some processors have inc/dec instructions that are atomic vs an interrupt. + * However, the code must first determine the differential location in a zone + * based on the processor number and then inc/dec the counter. There is no + * guarantee without disabling preemption that the processor will not change + * in between and therefore the atomicity vs. interrupt cannot be exploited + * in a useful way here. + */ +void __inc_zone_state(struct zone *zone, enum zone_stat_item item) +{ + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; + s8 __percpu *p = pcp->vm_stat_diff + item; + s8 v, t; + + /* See __mod_zone_page_state for why preemption is disabled here */ + preempt_disable_nested(); + + v = __this_cpu_inc_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v > t)) { + s8 overstep = t >> 1; + + zone_page_state_add(v + overstep, zone, item); + __this_cpu_write(*p, -overstep); + } + + preempt_enable_nested(); +} + +void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; + s8 __percpu *p = pcp->vm_node_stat_diff + item; + s8 v, t; + + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + + /* See __mod_zone_page_state for why preemption is disabled here */ + preempt_disable_nested(); + + v = __this_cpu_inc_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v > t)) { + s8 overstep = t >> 1; + + node_page_state_add(v + overstep, pgdat, item); + __this_cpu_write(*p, -overstep); + } + + preempt_enable_nested(); +} + +void
__inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + __inc_zone_state(page_zone(page), item); +} +EXPORT_SYMBOL(__inc_zone_page_state); + +void __inc_node_page_state(struct page *page, enum node_stat_item item) +{ + __inc_node_state(page_pgdat(page), item); +} +EXPORT_SYMBOL(__inc_node_page_state); + +void __dec_zone_state(struct zone *zone, enum zone_stat_item item) +{ + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; + s8 __percpu *p = pcp->vm_stat_diff + item; + s8 v, t; + + /* See __mod_zone_page_state for why preemption is disabled here */ + preempt_disable_nested(); + + v = __this_cpu_dec_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v < - t)) { + s8 overstep = t >> 1; + + zone_page_state_add(v - overstep, zone, item); + __this_cpu_write(*p, overstep); + } + + preempt_enable_nested(); +} + +void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; + s8 __percpu *p = pcp->vm_node_stat_diff + item; + s8 v, t; + + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + + /* See __mod_zone_page_state for why preemption is disabled here */ + preempt_disable_nested(); + + v = __this_cpu_dec_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v < - t)) { + s8 overstep = t >> 1; + + node_page_state_add(v - overstep, pgdat, item); + __this_cpu_write(*p, overstep); + } + + preempt_enable_nested(); +} + +void __dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + __dec_zone_state(page_zone(page), item); +} +EXPORT_SYMBOL(__dec_zone_page_state); + +void __dec_node_page_state(struct page *page, enum node_stat_item item) +{ + __dec_node_state(page_pgdat(page), item); +} +EXPORT_SYMBOL(__dec_node_page_state); + +#ifdef CONFIG_HAVE_CMPXCHG_LOCAL +/* + * If we have cmpxchg_local support then we do not need to incur the overhead + * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ * + * mod_zone_state() modifies the zone counter state through atomic per cpu + * operations. + * + * Overstep mode specifies how overstep should be handled: + * 0 No overstepping + * 1 Overstepping half of threshold + * -1 Overstepping minus half of threshold +*/ +static inline void mod_zone_state(struct zone *zone, + enum zone_stat_item item, long delta, int overstep_mode) +{ + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; + s8 __percpu *p = pcp->vm_stat_diff + item; + long o, n, t, z; + + do { + z = 0; /* overflow to zone counters */ + + /* + * The fetching of the stat_threshold is racy. We may apply + * a counter threshold to the wrong cpu if we get + * rescheduled while executing here. However, the next + * counter update will apply the threshold again and + * therefore bring the counter under the threshold again. + * + * Most of the time the thresholds are the same anyways + * for all cpus in a zone. + */ + t = this_cpu_read(pcp->stat_threshold); + + o = this_cpu_read(*p); + n = delta + o; + + if (abs(n) > t) { + int os = overstep_mode * (t >> 1) ; + + /* Overflow must be added to zone counters */ + z = n + os; + n = -os; + } + } while (this_cpu_cmpxchg(*p, o, n) != o); + + if (z) + zone_page_state_add(z, zone, item); +} + +void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + long delta) +{ + mod_zone_state(zone, item, delta, 0); +} +EXPORT_SYMBOL(mod_zone_page_state); + +void inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + mod_zone_state(page_zone(page), item, 1, 1); +} +EXPORT_SYMBOL(inc_zone_page_state); + +void dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + mod_zone_state(page_zone(page), item, -1, -1); +} +EXPORT_SYMBOL(dec_zone_page_state); + +static inline void mod_node_state(struct pglist_data *pgdat, + enum node_stat_item item, int delta, int overstep_mode) +{ + struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; + s8 __percpu *p = pcp->vm_node_stat_diff +
item; + long o, n, t, z; + + if (vmstat_item_in_bytes(item)) { + /* + * Only cgroups use subpage accounting right now; at + * the global level, these items still change in + * multiples of whole pages. Store them as pages + * internally to keep the per-cpu counters compact. + */ + VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); + delta >>= PAGE_SHIFT; + } + + do { + z = 0; /* overflow to node counters */ + + /* + * The fetching of the stat_threshold is racy. We may apply + * a counter threshold to the wrong cpu if we get + * rescheduled while executing here. However, the next + * counter update will apply the threshold again and + * therefore bring the counter under the threshold again. + * + * Most of the time the thresholds are the same anyways + * for all cpus in a node. + */ + t = this_cpu_read(pcp->stat_threshold); + + o = this_cpu_read(*p); + n = delta + o; + + if (abs(n) > t) { + int os = overstep_mode * (t >> 1) ; + + /* Overflow must be added to node counters */ + z = n + os; + n = -os; + } + } while (this_cpu_cmpxchg(*p, o, n) != o); + + if (z) + node_page_state_add(z, pgdat, item); +} + +void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, + long delta) +{ + mod_node_state(pgdat, item, delta, 0); +} +EXPORT_SYMBOL(mod_node_page_state); + +void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + mod_node_state(pgdat, item, 1, 1); +} + +void inc_node_page_state(struct page *page, enum node_stat_item item) +{ + mod_node_state(page_pgdat(page), item, 1, 1); +} +EXPORT_SYMBOL(inc_node_page_state); + +void dec_node_page_state(struct page *page, enum node_stat_item item) +{ + mod_node_state(page_pgdat(page), item, -1, -1); +} +EXPORT_SYMBOL(dec_node_page_state); +#else +/* + * Use interrupt disable to serialize counter updates + */ +void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + long delta) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_zone_page_state(zone, item, delta); +
local_irq_restore(flags); +} +EXPORT_SYMBOL(mod_zone_page_state); + +void inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + unsigned long flags; + struct zone *zone; + + zone = page_zone(page); + local_irq_save(flags); + __inc_zone_state(zone, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(inc_zone_page_state); + +void dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __dec_zone_page_state(page, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(dec_zone_page_state); + +void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __inc_node_state(pgdat, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(inc_node_state); + +void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, + long delta) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_node_page_state(pgdat, item, delta); + local_irq_restore(flags); +} +EXPORT_SYMBOL(mod_node_page_state); + +void inc_node_page_state(struct page *page, enum node_stat_item item) +{ + unsigned long flags; + struct pglist_data *pgdat; + + pgdat = page_pgdat(page); + local_irq_save(flags); + __inc_node_state(pgdat, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(inc_node_page_state); + +void dec_node_page_state(struct page *page, enum node_stat_item item) +{ + unsigned long flags; + + local_irq_save(flags); + __dec_node_page_state(page, item); + local_irq_restore(flags); +} +EXPORT_SYMBOL(dec_node_page_state); +#endif + +/* + * Fold a differential into the global counters. + * Returns the number of counters updated. 
+ */ +static int fold_diff(int *zone_diff, int *node_diff) +{ + int i; + int changes = 0; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (zone_diff[i]) { + atomic_long_add(zone_diff[i], &vm_zone_stat[i]); + changes++; + } + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + if (node_diff[i]) { + atomic_long_add(node_diff[i], &vm_node_stat[i]); + changes++; + } + return changes; +} + +/* + * Update the zone counters for the current cpu. + * + * Note that refresh_cpu_vm_stats strives to only access + * node local memory. The per cpu pagesets on remote zones are placed + * in the memory local to the processor using that pageset. So the + * loop over all zones will access a series of cachelines local to + * the processor. + * + * The call to zone_page_state_add updates the cachelines with the + * statistics in the remote zone struct as well as the global cachelines + * with the global counters. These could cause remote node cache line + * bouncing and will have to be only done when necessary. + * + * The function returns the number of global counters updated. 
+ */ +static int refresh_cpu_vm_stats(bool do_pagesets) +{ + struct pglist_data *pgdat; + struct zone *zone; + int i; + int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; + int changes = 0; + + for_each_populated_zone(zone) { + struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; +#ifdef CONFIG_NUMA + struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset; +#endif + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + int v; + + v = this_cpu_xchg(pzstats->vm_stat_diff[i], 0); + if (v) { + + atomic_long_add(v, &zone->vm_stat[i]); + global_zone_diff[i] += v; +#ifdef CONFIG_NUMA + /* 3 seconds idle till flush */ + __this_cpu_write(pcp->expire, 3); +#endif + } + } +#ifdef CONFIG_NUMA + + if (do_pagesets) { + cond_resched(); + /* + * Deal with draining the remote pageset of this + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!__this_cpu_read(pcp->expire) || + !__this_cpu_read(pcp->count)) + continue; + + /* + * We never drain zones local to this processor. + */ + if (zone_to_nid(zone) == numa_node_id()) { + __this_cpu_write(pcp->expire, 0); + continue; + } + + if (__this_cpu_dec_return(pcp->expire)) + continue; + + if (__this_cpu_read(pcp->count)) { + drain_zone_pages(zone, this_cpu_ptr(pcp)); + changes++; + } + } +#endif + } + + for_each_online_pgdat(pgdat) { + struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats; + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + int v; + + v = this_cpu_xchg(p->vm_node_stat_diff[i], 0); + if (v) { + atomic_long_add(v, &pgdat->vm_stat[i]); + global_node_diff[i] += v; + } + } + } + + changes += fold_diff(global_zone_diff, global_node_diff); + return changes; +} + +/* + * Fold the data for an offline cpu into the global array. + * There cannot be any access by the offline cpu and therefore + * synchronization is simplified. 
+ */ +void cpu_vm_stats_fold(int cpu) +{ + struct pglist_data *pgdat; + struct zone *zone; + int i; + int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; + + for_each_populated_zone(zone) { + struct per_cpu_zonestat *pzstats; + + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + if (pzstats->vm_stat_diff[i]) { + int v; + + v = pzstats->vm_stat_diff[i]; + pzstats->vm_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_stat[i]); + global_zone_diff[i] += v; + } + } +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) { + if (pzstats->vm_numa_event[i]) { + unsigned long v; + + v = pzstats->vm_numa_event[i]; + pzstats->vm_numa_event[i] = 0; + zone_numa_event_add(v, zone, i); + } + } +#endif + } + + for_each_online_pgdat(pgdat) { + struct per_cpu_nodestat *p; + + p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + if (p->vm_node_stat_diff[i]) { + int v; + + v = p->vm_node_stat_diff[i]; + p->vm_node_stat_diff[i] = 0; + atomic_long_add(v, &pgdat->vm_stat[i]); + global_node_diff[i] += v; + } + } + + fold_diff(global_zone_diff, global_node_diff); +} + +/* + * This is only called if !populated_zone(zone), which implies no other users of + * pzstats->vm_stat_diff[] exist. + */ +void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *pzstats) +{ + unsigned long v; + int i; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + if (pzstats->vm_stat_diff[i]) { + v = pzstats->vm_stat_diff[i]; + pzstats->vm_stat_diff[i] = 0; + zone_page_state_add(v, zone, i); + } + } + +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) { + if (pzstats->vm_numa_event[i]) { + v = pzstats->vm_numa_event[i]; + pzstats->vm_numa_event[i] = 0; + zone_numa_event_add(v, zone, i); + } + } +#endif +} +#endif + +#ifdef CONFIG_NUMA +/* + * Determine the per node value of a stat item.
This function + * is called frequently in a NUMA machine, so try to be as + * frugal as possible. + */ +unsigned long sum_zone_node_page_state(int node, + enum zone_stat_item item) +{ + struct zone *zones = NODE_DATA(node)->node_zones; + int i; + unsigned long count = 0; + + for (i = 0; i < MAX_NR_ZONES; i++) + count += zone_page_state(zones + i, item); + + return count; +} + +/* Determine the per node value of a numa stat item. */ +unsigned long sum_zone_numa_event_state(int node, + enum numa_stat_item item) +{ + struct zone *zones = NODE_DATA(node)->node_zones; + unsigned long count = 0; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) + count += zone_numa_event_state(zones + i, item); + + return count; +} + +/* + * Determine the per node value of a stat item. + */ +unsigned long node_page_state_pages(struct pglist_data *pgdat, + enum node_stat_item item) +{ + long x = atomic_long_read(&pgdat->vm_stat[item]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + +unsigned long node_page_state(struct pglist_data *pgdat, + enum node_stat_item item) +{ + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + + return node_page_state_pages(pgdat, item); +} +#endif + +#ifdef CONFIG_COMPACTION + +struct contig_page_info { + unsigned long free_pages; + unsigned long free_blocks_total; + unsigned long free_blocks_suitable; +}; + +/* + * Calculate the number of free pages in a zone, how many contiguous + * pages are free and how many are large enough to satisfy an allocation of + * the target size. Note that this function makes no attempt to estimate + * how many suitable free blocks there *might* be if MOVABLE pages were + * migrated. 
Calculating that is possible, but expensive and can be + * figured out from userspace + */ +static void fill_contig_page_info(struct zone *zone, + unsigned int suitable_order, + struct contig_page_info *info) +{ + unsigned int order; + + info->free_pages = 0; + info->free_blocks_total = 0; + info->free_blocks_suitable = 0; + + for (order = 0; order < MAX_ORDER; order++) { + unsigned long blocks; + + /* + * Count number of free blocks. + * + * Access to nr_free is lockless as nr_free is used only for + * diagnostic purposes. Use data_race to avoid KCSAN warning. + */ + blocks = data_race(zone->free_area[order].nr_free); + info->free_blocks_total += blocks; + + /* Count free base pages */ + info->free_pages += blocks << order; + + /* Count the suitable free blocks */ + if (order >= suitable_order) + info->free_blocks_suitable += blocks << + (order - suitable_order); + } +} + +/* + * A fragmentation index only makes sense if an allocation of a requested + * size would fail. If that is true, the fragmentation index indicates + * whether external fragmentation or a lack of memory was the problem. + * The value can be used to determine if page reclaim or compaction + * should be used + */ +static int __fragmentation_index(unsigned int order, struct contig_page_info *info) +{ + unsigned long requested = 1UL << order; + + if (WARN_ON_ONCE(order >= MAX_ORDER)) + return 0; + + if (!info->free_blocks_total) + return 0; + + /* Fragmentation index only makes sense when a request would fail */ + if (info->free_blocks_suitable) + return -1000; + + /* + * Index is between 0 and 1 so return within 3 decimal places + * + * 0 => allocation would fail due to lack of memory + * 1 => allocation would fail due to fragmentation + */ + return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); +} + +/* + * Calculates external fragmentation within a zone wrt the given order. 
+ * It is defined as the percentage of pages found in blocks of size + * less than 1 << order. It returns values in range [0, 100]. + */ +unsigned int extfrag_for_order(struct zone *zone, unsigned int order) +{ + struct contig_page_info info; + + fill_contig_page_info(zone, order, &info); + if (info.free_pages == 0) + return 0; + + return div_u64((info.free_pages - + (info.free_blocks_suitable << order)) * 100, + info.free_pages); +} + +/* Same as __fragmentation_index() but allocates contig_page_info on the stack */ +int fragmentation_index(struct zone *zone, unsigned int order) +{ + struct contig_page_info info; + + fill_contig_page_info(zone, order, &info); + return __fragmentation_index(order, &info); +} +#endif + +#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || \ + defined(CONFIG_NUMA) || defined(CONFIG_MEMCG) +#ifdef CONFIG_ZONE_DMA +#define TEXT_FOR_DMA(xx) xx "_dma", +#else +#define TEXT_FOR_DMA(xx) +#endif + +#ifdef CONFIG_ZONE_DMA32 +#define TEXT_FOR_DMA32(xx) xx "_dma32", +#else +#define TEXT_FOR_DMA32(xx) +#endif + +#ifdef CONFIG_HIGHMEM +#define TEXT_FOR_HIGHMEM(xx) xx "_high", +#else +#define TEXT_FOR_HIGHMEM(xx) +#endif + +#ifdef CONFIG_ZONE_DEVICE +#define TEXT_FOR_DEVICE(xx) xx "_device", +#else +#define TEXT_FOR_DEVICE(xx) +#endif + +#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ + TEXT_FOR_HIGHMEM(xx) xx "_movable", \ + TEXT_FOR_DEVICE(xx) + +const char * const vmstat_text[] = { + /* enum zone_stat_item counters */ + "nr_free_pages", + "nr_zone_inactive_anon", + "nr_zone_active_anon", + "nr_zone_inactive_file", + "nr_zone_active_file", + "nr_zone_unevictable", + "nr_zone_write_pending", + "nr_mlock", + "nr_bounce", +#if IS_ENABLED(CONFIG_ZSMALLOC) + "nr_zspages", +#endif + "nr_free_cma", + + /* enum numa_stat_item counters */ +#ifdef CONFIG_NUMA + "numa_hit", + "numa_miss", + "numa_foreign", + "numa_interleave", + "numa_local", + "numa_other", +#endif + + /* enum node_stat_item counters */ + "nr_inactive_anon", +
"nr_active_anon", + "nr_inactive_file", + "nr_active_file", + "nr_unevictable", + "nr_slab_reclaimable", + "nr_slab_unreclaimable", + "nr_isolated_anon", + "nr_isolated_file", + "workingset_nodes", + "workingset_refault_anon", + "workingset_refault_file", + "workingset_activate_anon", + "workingset_activate_file", + "workingset_restore_anon", + "workingset_restore_file", + "workingset_nodereclaim", + "nr_anon_pages", + "nr_mapped", + "nr_file_pages", + "nr_dirty", + "nr_writeback", + "nr_writeback_temp", + "nr_shmem", + "nr_shmem_hugepages", + "nr_shmem_pmdmapped", + "nr_file_hugepages", + "nr_file_pmdmapped", + "nr_anon_transparent_hugepages", + "nr_vmscan_write", + "nr_vmscan_immediate_reclaim", + "nr_dirtied", + "nr_written", + "nr_throttled_written", + "nr_kernel_misc_reclaimable", + "nr_foll_pin_acquired", + "nr_foll_pin_released", + "nr_kernel_stack", +#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) + "nr_shadow_call_stack", +#endif + "nr_page_table_pages", + "nr_sec_page_table_pages", +#ifdef CONFIG_SWAP + "nr_swapcached", +#endif +#ifdef CONFIG_NUMA_BALANCING + "pgpromote_success", + "pgpromote_candidate", +#endif + + /* enum writeback_stat_item counters */ + "nr_dirty_threshold", + "nr_dirty_background_threshold", + +#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) + /* enum vm_event_item counters */ + "pgpgin", + "pgpgout", + "pswpin", + "pswpout", + + TEXTS_FOR_ZONES("pgalloc") + TEXTS_FOR_ZONES("allocstall") + TEXTS_FOR_ZONES("pgskip") + + "pgfree", + "pgactivate", + "pgdeactivate", + "pglazyfree", + + "pgfault", + "pgmajfault", + "pglazyfreed", + + "pgrefill", + "pgreuse", + "pgsteal_kswapd", + "pgsteal_direct", + "pgdemote_kswapd", + "pgdemote_direct", + "pgscan_kswapd", + "pgscan_direct", + "pgscan_direct_throttle", + "pgscan_anon", + "pgscan_file", + "pgsteal_anon", + "pgsteal_file", + +#ifdef CONFIG_NUMA + "zone_reclaim_failed", +#endif + "pginodesteal", + "slabs_scanned", + "kswapd_inodesteal", + "kswapd_low_wmark_hit_quickly", + 
"kswapd_high_wmark_hit_quickly", + "pageoutrun", + + "pgrotated", + + "drop_pagecache", + "drop_slab", + "oom_kill", + +#ifdef CONFIG_NUMA_BALANCING + "numa_pte_updates", + "numa_huge_pte_updates", + "numa_hint_faults", + "numa_hint_faults_local", + "numa_pages_migrated", +#endif +#ifdef CONFIG_MIGRATION + "pgmigrate_success", + "pgmigrate_fail", + "thp_migration_success", + "thp_migration_fail", + "thp_migration_split", +#endif +#ifdef CONFIG_COMPACTION + "compact_migrate_scanned", + "compact_free_scanned", + "compact_isolated", + "compact_stall", + "compact_fail", + "compact_success", + "compact_daemon_wake", + "compact_daemon_migrate_scanned", + "compact_daemon_free_scanned", +#endif + +#ifdef CONFIG_HUGETLB_PAGE + "htlb_buddy_alloc_success", + "htlb_buddy_alloc_fail", +#endif +#ifdef CONFIG_CMA + "cma_alloc_success", + "cma_alloc_fail", +#endif + "unevictable_pgs_culled", + "unevictable_pgs_scanned", + "unevictable_pgs_rescued", + "unevictable_pgs_mlocked", + "unevictable_pgs_munlocked", + "unevictable_pgs_cleared", + "unevictable_pgs_stranded", + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "thp_fault_alloc", + "thp_fault_fallback", + "thp_fault_fallback_charge", + "thp_collapse_alloc", + "thp_collapse_alloc_failed", + "thp_file_alloc", + "thp_file_fallback", + "thp_file_fallback_charge", + "thp_file_mapped", + "thp_split_page", + "thp_split_page_failed", + "thp_deferred_split_page", + "thp_split_pmd", + "thp_scan_exceed_none_pte", + "thp_scan_exceed_swap_pte", + "thp_scan_exceed_share_pte", +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + "thp_split_pud", +#endif + "thp_zero_page_alloc", + "thp_zero_page_alloc_failed", + "thp_swpout", + "thp_swpout_fallback", +#endif +#ifdef CONFIG_MEMORY_BALLOON + "balloon_inflate", + "balloon_deflate", +#ifdef CONFIG_BALLOON_COMPACTION + "balloon_migrate", +#endif +#endif /* CONFIG_MEMORY_BALLOON */ +#ifdef CONFIG_DEBUG_TLBFLUSH + "nr_tlb_remote_flush", + "nr_tlb_remote_flush_received", + "nr_tlb_local_flush_all", + 
"nr_tlb_local_flush_one", +#endif /* CONFIG_DEBUG_TLBFLUSH */ + +#ifdef CONFIG_SWAP + "swap_ra", + "swap_ra_hit", +#ifdef CONFIG_KSM + "ksm_swpin_copy", +#endif +#endif +#ifdef CONFIG_KSM + "cow_ksm", +#endif +#ifdef CONFIG_ZSWAP + "zswpin", + "zswpout", +#endif +#ifdef CONFIG_X86 + "direct_map_level2_splits", + "direct_map_level3_splits", +#endif +#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ +}; +#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ + +#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ + defined(CONFIG_PROC_FS) +static void *frag_start(struct seq_file *m, loff_t *pos) +{ + pg_data_t *pgdat; + loff_t node = *pos; + + for (pgdat = first_online_pgdat(); + pgdat && node; + pgdat = next_online_pgdat(pgdat)) + --node; + + return pgdat; +} + +static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + (*pos)++; + return next_online_pgdat(pgdat); +} + +static void frag_stop(struct seq_file *m, void *arg) +{ +} + +/* + * Walk zones in a node and print using a callback. + * If @assert_populated is true, only use callback for zones that are populated. 
+ */ +static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, + bool assert_populated, bool nolock, + void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) +{ + struct zone *zone; + struct zone *node_zones = pgdat->node_zones; + unsigned long flags; + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (assert_populated && !populated_zone(zone)) + continue; + + if (!nolock) + spin_lock_irqsave(&zone->lock, flags); + print(m, pgdat, zone); + if (!nolock) + spin_unlock_irqrestore(&zone->lock, flags); + } +} +#endif + +#ifdef CONFIG_PROC_FS +static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, + struct zone *zone) +{ + int order; + + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (order = 0; order < MAX_ORDER; ++order) + /* + * Access to nr_free is lockless as nr_free is used only for + * printing purposes. Use data_race to avoid KCSAN warning. + */ + seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free)); + seq_putc(m, '\n'); +} + +/* + * This walks the free areas for each zone. + */ +static int frag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + walk_zones_in_node(m, pgdat, true, false, frag_show_print); + return 0; +} + +static void pagetypeinfo_showfree_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + int order, mtype; + + for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) { + seq_printf(m, "Node %4d, zone %8s, type %12s ", + pgdat->node_id, + zone->name, + migratetype_names[mtype]); + for (order = 0; order < MAX_ORDER; ++order) { + unsigned long freecount = 0; + struct free_area *area; + struct list_head *curr; + bool overflow = false; + + area = &(zone->free_area[order]); + + list_for_each(curr, &area->free_list[mtype]) { + /* + * Cap the free_list iteration because it might + * be really large and we are under a spinlock + * so a long time spent here could trigger a + * hard lockup detector. 
Anyway this is a + * debugging tool so knowing there is a handful + * of pages of this order should be more than + * sufficient. + */ + if (++freecount >= 100000) { + overflow = true; + break; + } + } + seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount); + spin_unlock_irq(&zone->lock); /* release zone->lock around cond_resched(), then retake it */ + cond_resched(); + spin_lock_irq(&zone->lock); + } + seq_putc(m, '\n'); + } +} + +/* Print out the free pages at each order for each migratetype */ +static void pagetypeinfo_showfree(struct seq_file *m, void *arg) +{ + int order; + pg_data_t *pgdat = (pg_data_t *)arg; + + /* Print header */ + seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6d ", order); + seq_putc(m, '\n'); + + walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print); +} + +static void pagetypeinfo_showblockcount_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + int mtype; + unsigned long pfn; + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); + unsigned long count[MIGRATE_TYPES] = { 0, }; /* pageblocks seen per migratetype */ + + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { + struct page *page; + + page = pfn_to_online_page(pfn); + if (!page) + continue; + + if (page_zone(page) != zone) + continue; /* this pageblock belongs to a different zone */ + + mtype = get_pageblock_migratetype(page); + + if (mtype < MIGRATE_TYPES) /* guards the count[] index */ + count[mtype]++; + } + + /* Print counts */ + seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); + for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) + seq_printf(m, "%12lu ", count[mtype]); + seq_putc(m, '\n'); +} + +/* Print out the number of pageblocks for each migratetype */ +static void pagetypeinfo_showblockcount(struct seq_file *m, void *arg) +{ + int mtype; + pg_data_t *pgdat = (pg_data_t *)arg; + + seq_printf(m, "\n%-23s", "Number of blocks type "); + for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) + seq_printf(m, "%12s ", migratetype_names[mtype]); + seq_putc(m, '\n'); +
walk_zones_in_node(m, pgdat, true, false, + pagetypeinfo_showblockcount_print); +} + +/* + * Print out the number of pageblocks for each migratetype that contain pages + * of other types. This gives an indication of how well fallbacks are being + * contained by rmqueue_fallback(). It requires information from PAGE_OWNER + * to determine what is going on, so nothing is printed when CONFIG_PAGE_OWNER + * is disabled or page_owner_inited is not set. + */ +static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) +{ +#ifdef CONFIG_PAGE_OWNER + int mtype; + + if (!static_branch_unlikely(&page_owner_inited)) + return; + + drain_all_pages(NULL); + + seq_printf(m, "\n%-23s", "Number of mixed blocks "); + for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) + seq_printf(m, "%12s ", migratetype_names[mtype]); + seq_putc(m, '\n'); + + walk_zones_in_node(m, pgdat, true, true, + pagetypeinfo_showmixedcount_print); /* nolock is true: the walker does not take zone->lock here */ +#endif /* CONFIG_PAGE_OWNER */ +} + +/* + * This prints out statistics in relation to grouping pages by mobility. + * It is expensive to collect so do not constantly read the file.
+ */ +static int pagetypeinfo_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + /* check memoryless node */ + if (!node_state(pgdat->node_id, N_MEMORY)) + return 0; + + seq_printf(m, "Page block order: %d\n", pageblock_order); + seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages); + seq_putc(m, '\n'); + pagetypeinfo_showfree(m, pgdat); + pagetypeinfo_showblockcount(m, pgdat); + pagetypeinfo_showmixedcount(m, pgdat); + + return 0; +} + +static const struct seq_operations fragmentation_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = frag_show, +}; + +static const struct seq_operations pagetypeinfo_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = pagetypeinfo_show, +}; + +static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone) +{ + int zid; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct zone *compare = &pgdat->node_zones[zid]; + + if (populated_zone(compare)) + return zone == compare; + } + + return false; +} + +static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, + struct zone *zone) +{ + int i; + seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); + if (is_zone_first_populated(pgdat, zone)) { + seq_printf(m, "\n per-node stats"); + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + unsigned long pages = node_page_state_pages(pgdat, i); + + if (vmstat_item_print_in_thp(i)) + pages /= HPAGE_PMD_NR; + seq_printf(m, "\n %-12s %lu", node_stat_name(i), + pages); + } + } + seq_printf(m, + "\n pages free %lu" + "\n boost %lu" + "\n min %lu" + "\n low %lu" + "\n high %lu" + "\n spanned %lu" + "\n present %lu" + "\n managed %lu" + "\n cma %lu", + zone_page_state(zone, NR_FREE_PAGES), + zone->watermark_boost, + min_wmark_pages(zone), + low_wmark_pages(zone), + high_wmark_pages(zone), + zone->spanned_pages, + zone->present_pages, + zone_managed_pages(zone), + zone_cma_pages(zone)); + + seq_printf(m, + "\n protection: 
(%ld", + zone->lowmem_reserve[0]); + for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) + seq_printf(m, ", %ld", zone->lowmem_reserve[i]); + seq_putc(m, ')'); + + /* If unpopulated, no other information is useful */ + if (!populated_zone(zone)) { + seq_putc(m, '\n'); + return; + } + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", zone_stat_name(i), + zone_page_state(zone, i)); + +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", numa_stat_name(i), + zone_numa_event_state(zone, i)); +#endif + + seq_printf(m, "\n pagesets"); + for_each_online_cpu(i) { + struct per_cpu_pages *pcp; + struct per_cpu_zonestat __maybe_unused *pzstats; + + pcp = per_cpu_ptr(zone->per_cpu_pageset, i); + seq_printf(m, + "\n cpu: %i" + "\n count: %i" + "\n high: %i" + "\n batch: %i", + i, + pcp->count, + pcp->high, + pcp->batch); +#ifdef CONFIG_SMP + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i); + seq_printf(m, "\n vm stats threshold: %d", + pzstats->stat_threshold); +#endif + } + seq_printf(m, + "\n node_unreclaimable: %u" + "\n start_pfn: %lu", + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, + zone->zone_start_pfn); + seq_putc(m, '\n'); +} + +/* + * Output information about zones in @pgdat. All zones are printed regardless + * of whether they are populated or not: lowmem_reserve_ratio operates on the + * set of all zones and userspace would not be aware of such zones if they are + * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio). + */ +static int zoneinfo_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print); + return 0; +} + +static const struct seq_operations zoneinfo_op = { + .start = frag_start, /* iterate over all zones. The same as in + * fragmentation. 
*/ + .next = frag_next, + .stop = frag_stop, + .show = zoneinfo_show, +}; + +#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \ + NR_VM_NUMA_EVENT_ITEMS + \ + NR_VM_NODE_STAT_ITEMS + \ + NR_VM_WRITEBACK_STAT_ITEMS + \ + (IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \ + NR_VM_EVENT_ITEMS : 0)) + +static void *vmstat_start(struct seq_file *m, loff_t *pos) +{ + unsigned long *v; + int i; + + if (*pos >= NR_VMSTAT_ITEMS) + return NULL; + + BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS); + fold_vm_numa_events(); + v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL); + m->private = v; /* snapshot buffer, released in vmstat_stop() */ + if (!v) + return ERR_PTR(-ENOMEM); + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + v[i] = global_zone_page_state(i); + v += NR_VM_ZONE_STAT_ITEMS; /* v now points at the next section of the snapshot */ + +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) + v[i] = global_numa_event_state(i); + v += NR_VM_NUMA_EVENT_ITEMS; +#endif + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + v[i] = global_node_page_state_pages(i); + if (vmstat_item_print_in_thp(i)) + v[i] /= HPAGE_PMD_NR; + } + v += NR_VM_NODE_STAT_ITEMS; + + global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, + v + NR_DIRTY_THRESHOLD); + v += NR_VM_WRITEBACK_STAT_ITEMS; + +#ifdef CONFIG_VM_EVENT_COUNTERS + all_vm_events(v); + v[PGPGIN] /= 2; /* sectors -> kbytes */ + v[PGPGOUT] /= 2; +#endif + return (unsigned long *)m->private + *pos; /* v was advanced above, so index from the buffer start */ +} + +static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) +{ + (*pos)++; + if (*pos >= NR_VMSTAT_ITEMS) + return NULL; + return (unsigned long *)m->private + *pos; +} + +static int vmstat_show(struct seq_file *m, void *arg) +{ + unsigned long *l = arg; + unsigned long off = l - (unsigned long *)m->private; /* index of this item in the snapshot and in vmstat_text[] */ + + seq_puts(m, vmstat_text[off]); + seq_put_decimal_ull(m, " ", *l); + seq_putc(m, '\n'); + + if (off == NR_VMSTAT_ITEMS - 1) { + /* + * We've come to the end - add any deprecated counters to avoid + * breaking userspace which might depend on them being present.
+ */ + seq_puts(m, "nr_unstable 0\n"); + } + return 0; +} + +static void vmstat_stop(struct seq_file *m, void *arg) +{ + kfree(m->private); /* frees the snapshot allocated in vmstat_start() */ + m->private = NULL; +} + +static const struct seq_operations vmstat_op = { + .start = vmstat_start, + .next = vmstat_next, + .stop = vmstat_stop, + .show = vmstat_show, +}; +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct delayed_work, vmstat_work); /* per-CPU work item that runs vmstat_update() */ +int sysctl_stat_interval __read_mostly = HZ; + +#ifdef CONFIG_PROC_FS +static void refresh_vm_stats(struct work_struct *work) +{ + refresh_cpu_vm_stats(true); +} + +int vmstat_refresh(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + long val; + int err; + int i; + + /* + * The regular update, every sysctl_stat_interval, may come later + * than expected: leaving a significant amount in per_cpu buckets. + * This is particularly misleading when checking a quantity of HUGE + * pages, immediately after running a test. /proc/sys/vm/stat_refresh, + * which can equally be echo'ed to or cat'ted from (by root), + * can be used to update the stats just before reading them. + * + * Oh, and since global_zone_page_state() etc. are so careful to hide + * transiently negative values, log a warning here if any of + * the stats is negative, so we know to go looking for imbalance. + */ + err = schedule_on_each_cpu(refresh_vm_stats); + if (err) + return err; + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + /* + * Skip checking stats known to go negative occasionally. + */ + switch (i) { + case NR_ZONE_WRITE_PENDING: + case NR_FREE_CMA_PAGES: + continue; /* moves on to the next stat item, not just out of the switch */ + } + val = atomic_long_read(&vm_zone_stat[i]); + if (val < 0) { + pr_warn("%s: %s %ld\n", + __func__, zone_stat_name(i), val); + } + } + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + /* + * Skip checking stats known to go negative occasionally.
+ */ + switch (i) { + case NR_WRITEBACK: + continue; + } + val = atomic_long_read(&vm_node_stat[i]); + if (val < 0) { + pr_warn("%s: %s %ld\n", + __func__, node_stat_name(i), val); + } + } + if (write) + *ppos += *lenp; /* write: advance the file position past the input */ + else + *lenp = 0; /* read: report zero bytes */ + return 0; +} +#endif /* CONFIG_PROC_FS */ + +static void vmstat_update(struct work_struct *w) +{ + if (refresh_cpu_vm_stats(true)) { + /* + * Counters were updated so we expect more updates + * to occur in the future. Keep on running the + * update worker thread. + */ + queue_delayed_work_on(smp_processor_id(), mm_percpu_wq, + this_cpu_ptr(&vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + } +} + +/* + * Check if the diffs for a certain cpu indicate that + * an update is needed. + * Returns true if any per-cpu zone or node stat diff for @cpu is non-zero. + */ +static bool need_update(int cpu) +{ + pg_data_t *last_pgdat = NULL; + struct zone *zone; + + for_each_populated_zone(zone) { + struct per_cpu_zonestat *pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); + struct per_cpu_nodestat *n; + + /* + * The fast way of checking if there are any vmstat diffs. + */ + if (memchr_inv(pzstats->vm_stat_diff, 0, sizeof(pzstats->vm_stat_diff))) + return true; + + if (last_pgdat == zone->zone_pgdat) + continue; /* node diffs were already checked for this pgdat */ + last_pgdat = zone->zone_pgdat; + n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu); + if (memchr_inv(n->vm_node_stat_diff, 0, sizeof(n->vm_node_stat_diff))) + return true; + } + return false; +} + +/* + * Switch off vmstat processing and then fold all the remaining differentials + * until the diffs stay at zero. The function is used by NOHZ and can only be + * invoked when tick processing is not active. + */ +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + if (!delayed_work_pending(this_cpu_ptr(&vmstat_work))) + return; + + if (!need_update(smp_processor_id())) + return; + + /* + * Just refresh counters and do not care about the pending delayed + * vmstat_update.
It doesn't fire that often to matter and canceling + * it would be too expensive from this path. + * vmstat_shepherd will take care about that for us. + */ + refresh_cpu_vm_stats(false); +} + +/* + * Shepherd worker thread that checks the + * differentials of processors that have their worker + * threads for vm statistics updates disabled because of + * inactivity. + */ +static void vmstat_shepherd(struct work_struct *w); + +static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd); + +static void vmstat_shepherd(struct work_struct *w) +{ + int cpu; + + cpus_read_lock(); + /* Check processors whose vmstat worker threads have been disabled */ + for_each_online_cpu(cpu) { + struct delayed_work *dw = &per_cpu(vmstat_work, cpu); + + if (!delayed_work_pending(dw) && need_update(cpu)) + queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0); /* queue that CPU's vmstat_work with no delay */ + + cond_resched(); + } + cpus_read_unlock(); + + schedule_delayed_work(&shepherd, + round_jiffies_relative(sysctl_stat_interval)); /* the shepherd re-arms itself every interval */ +} + +static void __init start_shepherd_timer(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), + vmstat_update); + + schedule_delayed_work(&shepherd, + round_jiffies_relative(sysctl_stat_interval)); +} + +static void __init init_cpu_node_state(void) +{ + int node; + + for_each_online_node(node) { + if (!cpumask_empty(cpumask_of_node(node))) + node_set_state(node, N_CPU); + } +} + +static int vmstat_cpu_online(unsigned int cpu) +{ + refresh_zone_stat_thresholds(); + + if (!node_state(cpu_to_node(cpu), N_CPU)) { + node_set_state(cpu_to_node(cpu), N_CPU); + } + + return 0; +} + +static int vmstat_cpu_down_prep(unsigned int cpu) +{ + cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); /* registered as the teardown callback in init_mm_internals() */ + return 0; +} + +static int vmstat_cpu_dead(unsigned int cpu) +{ + const struct cpumask *node_cpus; + int node; + + node = cpu_to_node(cpu); + + refresh_zone_stat_thresholds(); + node_cpus = cpumask_of_node(node); + if (!cpumask_empty(node_cpus)) + return 0; /* the node still has CPUs, so N_CPU stays set */ + +
node_clear_state(node, N_CPU); + + return 0; +} + +#endif + +struct workqueue_struct *mm_percpu_wq; + +void __init init_mm_internals(void) +{ + int ret __maybe_unused; /* only assigned under CONFIG_SMP */ + + mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0); /* NOTE(review): the result is not checked for NULL here */ + +#ifdef CONFIG_SMP + ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", + NULL, vmstat_cpu_dead); + if (ret < 0) + pr_err("vmstat: failed to register 'dead' hotplug state\n"); + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/vmstat:online", + vmstat_cpu_online, + vmstat_cpu_down_prep); + if (ret < 0) + pr_err("vmstat: failed to register 'online' hotplug state\n"); + + cpus_read_lock(); + init_cpu_node_state(); + cpus_read_unlock(); + + start_shepherd_timer(); +#endif +#ifdef CONFIG_PROC_FS + proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); + proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); /* mode 0400, while the other three files are 0444 */ + proc_create_seq("vmstat", 0444, NULL, &vmstat_op); + proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op); +#endif +} + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) + +/* + * Return an index indicating how much of the available free memory is + * unusable for an allocation of the requested size. + */ +static int unusable_free_index(unsigned int order, + struct contig_page_info *info) +{ + /* No free memory is interpreted as all free memory is unusable */ + if (info->free_pages == 0) + return 1000; + + /* + * Index should be a value between 0 and 1. Return a value to 3 + * decimal places.
+ * + * 0 => no fragmentation + * 1 => high fragmentation + */ + return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages); /* result is scaled by 1000, i.e. expressed in thousandths */ + +} + +static void unusable_show_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + unsigned int order; + int index; + struct contig_page_info info; + + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + fill_contig_page_info(zone, order, &info); + index = unusable_free_index(order, &info); + seq_printf(m, "%d.%03d ", index / 1000, index % 1000); /* prints the thousandths value as a 3-place decimal */ + } + + seq_putc(m, '\n'); +} + +/* + * Display unusable free space index + * + * The unusable free space index measures how much of the available free + * memory cannot be used to satisfy an allocation of a given size and is a + * value between 0 and 1. The higher the value, the more of free memory is + * unusable and by implication, the worse the external fragmentation is. This + * can be expressed as a percentage by multiplying by 100.
+ */ +static int unusable_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + /* check memoryless node */ + if (!node_state(pgdat->node_id, N_MEMORY)) + return 0; + + walk_zones_in_node(m, pgdat, true, false, unusable_show_print); + + return 0; +} + +static const struct seq_operations unusable_sops = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = unusable_show, +}; + +DEFINE_SEQ_ATTRIBUTE(unusable); /* presumably generates unusable_fops from unusable_sops; unusable_fops is used in extfrag_debug_init() */ + +static void extfrag_show_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + unsigned int order; + int index; + + /* Alloc on stack as interrupts are disabled for zone walk */ + struct contig_page_info info; + + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + fill_contig_page_info(zone, order, &info); + index = __fragmentation_index(order, &info); + seq_printf(m, "%2d.%03d ", index / 1000, index % 1000); + } + + seq_putc(m, '\n'); +} + +/* + * Display fragmentation index for orders that allocations would fail for + * NOTE(review): unlike unusable_show(), this has no explicit memoryless-node + * check; it relies on the walker skipping unpopulated zones. + */ +static int extfrag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + walk_zones_in_node(m, pgdat, true, false, extfrag_show_print); + + return 0; +} + +static const struct seq_operations extfrag_sops = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = extfrag_show, +}; + +DEFINE_SEQ_ATTRIBUTE(extfrag); + +static int __init extfrag_debug_init(void) +{ + struct dentry *extfrag_debug_root; + + extfrag_debug_root = debugfs_create_dir("extfrag", NULL); /* NULL parent: presumably the debugfs root - confirm */ + + debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, + &unusable_fops); + + debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, + &extfrag_fops); + + return 0; +} + +module_init(extfrag_debug_init); +#endif diff --git a/mm/workingset.c b/mm/workingset.c new file mode 100644 index 000000000..6e4699055 --- /dev/null +++ b/mm/workingset.c @@ -0,0 +1,748 @@ +//
SPDX-License-Identifier: GPL-2.0 +/* + * Workingset detection + * + * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Double CLOCK lists + * + * Per node, two clock lists are maintained for file pages: the + * inactive and the active list. Freshly faulted pages start out at + * the head of the inactive list and page reclaim scans pages from the + * tail. Pages that are accessed multiple times on the inactive list + * are promoted to the active list, to protect them from reclaim, + * whereas active pages are demoted to the inactive list when the + * active list grows too big. + * + * fault ------------------------+ + * | + * +--------------+ | +-------------+ + * reclaim <- | inactive | <-+-- demotion | active | <--+ + * +--------------+ +-------------+ | + * | | + * +-------------- promotion ------------------+ + * + * + * Access frequency and refault distance + * + * A workload is thrashing when its pages are frequently used but they + * are evicted from the inactive list every time before another access + * would have promoted them to the active list. + * + * In cases where the average access distance between thrashing pages + * is bigger than the size of memory there is nothing that can be + * done - the thrashing set could never fit into memory under any + * circumstance. + * + * However, the average access distance could be bigger than the + * inactive list, yet smaller than the size of memory. In this case, + * the set could fit into memory if it weren't for the currently + * active pages - which may be used more, hopefully less frequently: + * + * +-memory available to cache-+ + * | | + * +-inactive------+-active----+ + * a b | c d e f g h i | J K L M N | + * +---------------+-----------+ + * + * It is prohibitively expensive to accurately track access frequency + * of pages. 
But a reasonable approximation can be made to measure + * thrashing on the inactive list, after which refaulting pages can be + * activated optimistically to compete with the existing active pages. + * + * Approximating inactive page access frequency - Observations: + * + * 1. When a page is accessed for the first time, it is added to the + * head of the inactive list, slides every existing inactive page + * towards the tail by one slot, and pushes the current tail page + * out of memory. + * + * 2. When a page is accessed for the second time, it is promoted to + * the active list, shrinking the inactive list by one slot. This + * also slides all inactive pages that were faulted into the cache + * more recently than the activated page towards the tail of the + * inactive list. + * + * Thus: + * + * 1. The sum of evictions and activations between any two points in + * time indicates the minimum number of inactive pages accessed in + * between. + * + * 2. Moving one inactive page N page slots towards the tail of the + * list requires at least N inactive page accesses. + * + * Combining these: + * + * 1. When a page is finally evicted from memory, the number of + * inactive pages accessed while the page was in cache is at least + * the number of page slots on the inactive list. + * + * 2. In addition, measuring the sum of evictions and activations (E) + * at the time of a page's eviction, and comparing it to another + * reading (R) at the time the page faults back into memory tells + * the minimum number of accesses while the page was not cached. + * This is called the refault distance.
+ * + * Because the first access of the page was the fault and the second + * access the refault, we combine the in-cache distance with the + * out-of-cache distance to get the complete minimum access distance + * of this page: + * + * NR_inactive + (R - E) + * + * And knowing the minimum access distance of a page, we can easily + * tell if the page would be able to stay in cache assuming all page + * slots in the cache were available: + * + * NR_inactive + (R - E) <= NR_inactive + NR_active + * + * which can be further simplified to + * + * (R - E) <= NR_active + * + * Put into words, the refault distance (out-of-cache) can be seen as + * a deficit in inactive list space (in-cache). If the inactive list + * had (R - E) more page slots, the page would not have been evicted + * in between accesses, but activated instead. And on a full system, + * the only thing eating into inactive list space is active pages. + * + * + * Refaulting inactive pages + * + * All that is known about the active list is that the pages have been + * accessed more than once in the past. This means that at any given + * time there is actually a good chance that pages on the active list + * are no longer in active use. + * + * So when a refault distance of (R - E) is observed and there are at + * least (R - E) active pages, the refaulting page is activated + * optimistically in the hope that (R - E) active pages are actually + * used less frequently than the refaulting page - or even not used at + * all anymore. + * + * That means if inactive cache is refaulting with a suitable refault + * distance, we assume the cache workingset is transitioning and put + * pressure on the current active list. + * + * If this is wrong and demotion kicks in, the pages which are truly + * used more frequently will be reactivated while the less frequently + * used ones will be evicted from memory.
+ * + * But if this is right, the stale pages will be pushed out of memory + * and the used pages get to stay in cache. + * + * Refaulting active pages + * + * If on the other hand the refaulting pages have recently been + * deactivated, it means that the active list is no longer protecting + * actively used cache from reclaim. The cache is NOT transitioning to + * a different workingset; the existing workingset is thrashing in the + * space allocated to the page cache. + * + * + * Implementation + * + * For each node's LRU lists, a counter for inactive evictions and + * activations is maintained (node->nonresident_age). + * + * On eviction, a snapshot of this counter (along with some bits to + * identify the node) is stored in the now empty page cache + * slot of the evicted page. This is called a shadow entry. + * + * On cache misses for which there are shadow entries, an eligible + * refault distance will immediately activate the refaulting page. + */ + +#define WORKINGSET_SHIFT 1 /* bits used for the workingset flag in a shadow entry */ +#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ + WORKINGSET_SHIFT + NODES_SHIFT + \ + MEM_CGROUP_ID_SHIFT) +#define EVICTION_MASK (~0UL >> EVICTION_SHIFT) /* eviction counter bits kept by pack_shadow() */ + +/* + * Eviction timestamps need to be able to cover the full range of + * actionable refaults. However, bits are tight in the xarray + * entry, and after storing the identifier for the lruvec there might + * not be enough left to represent every single actionable refault. In + * that case, we have to sacrifice granularity for distance, and group + * evictions into coarser buckets by shaving off lower timestamp bits.
+ */ +static unsigned int bucket_order __read_mostly; + +static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, + bool workingset) +{ + eviction &= EVICTION_MASK; /* drop counter bits that will not fit in the entry */ + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; + eviction = (eviction << NODES_SHIFT) | pgdat->node_id; + eviction = (eviction << WORKINGSET_SHIFT) | workingset; /* layout, high to low: eviction | memcg id | node id | workingset */ + + return xa_mk_value(eviction); +} + +static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, + unsigned long *evictionp, bool *workingsetp) +{ + unsigned long entry = xa_to_value(shadow); /* fields are peeled off in the reverse order of pack_shadow() */ + int memcgid, nid; + bool workingset; + + workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1); + entry >>= WORKINGSET_SHIFT; + nid = entry & ((1UL << NODES_SHIFT) - 1); + entry >>= NODES_SHIFT; + memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); + entry >>= MEM_CGROUP_ID_SHIFT; + + *memcgidp = memcgid; + *pgdat = NODE_DATA(nid); + *evictionp = entry; /* what remains is the eviction value */ + *workingsetp = workingset; +} + +#ifdef CONFIG_LRU_GEN + +static void *lru_gen_eviction(struct folio *folio) +{ + int hist; + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; + struct lru_gen_struct *lrugen; + int type = folio_is_file_lru(folio); + int delta = folio_nr_pages(folio); + int refs = folio_lru_refs(folio); + int tier = lru_tier_from_refs(refs); + struct mem_cgroup *memcg = folio_memcg(folio); + struct pglist_data *pgdat = folio_pgdat(folio); + + BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + lrugen = &lruvec->lrugen; + min_seq = READ_ONCE(lrugen->min_seq[type]); + token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0); /* min_seq in the upper bits, refs - 1 (floored at 0) in the low LRU_REFS_WIDTH bits */ + + hist = lru_hist_from_seq(min_seq); + atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); + + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); /* refs converts to bool: workingset is set when refs is non-zero */ +} + +static void lru_gen_refault(struct folio *folio, void *shadow) +{ + int hist, tier, refs; + int memcg_id; + bool workingset; + unsigned long token; +
unsigned long min_seq; + struct lruvec *lruvec; + struct lru_gen_struct *lrugen; + struct mem_cgroup *memcg; + struct pglist_data *pgdat; + int type = folio_is_file_lru(folio); + int delta = folio_nr_pages(folio); + + unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); + + if (pgdat != folio_pgdat(folio)) + return; + + rcu_read_lock(); + + memcg = folio_memcg_rcu(folio); + if (memcg_id != mem_cgroup_id(memcg)) + goto unlock; + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + lrugen = &lruvec->lrugen; + + min_seq = READ_ONCE(lrugen->min_seq[type]); + if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) + goto unlock; + + hist = lru_hist_from_seq(min_seq); + /* see the comment in folio_lru_refs() */ + refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; + tier = lru_tier_from_refs(refs); + + atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); + + /* + * Count the following two cases as stalls: + * 1. For pages accessed through page tables, hotter pages pushed out + * hot pages which refaulted immediately. + * 2. For pages accessed multiple times through file descriptors, + * they would have been protected by sort_folio(). 
+ */ + if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) { + set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset)); + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); + } +unlock: + rcu_read_unlock(); +} + +#else /* !CONFIG_LRU_GEN */ + +static void *lru_gen_eviction(struct folio *folio) +{ + return NULL; +} + +static void lru_gen_refault(struct folio *folio, void *shadow) +{ +} + +#endif /* CONFIG_LRU_GEN */ + +/** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @lruvec: the lruvec that was aged + * @nr_pages: the number of pages to count + * + * As in-memory pages are aged, non-resident pages need to be aged as + * well, in order for the refault distances later on to be comparable + * to the in-memory dimensions. This function allows reclaim and LRU + * operations to drive the non-resident aging along in parallel. + */ +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) +{ + /* + * Reclaiming a cgroup means reclaiming all its children in a + * round-robin fashion. That means that each cgroup has an LRU + * order that is composed of the LRU orders of its child + * cgroups; and every page has an LRU position not just in the + * cgroup that owns it, but in all of that group's ancestors. + * + * So when the physical inactive list of a leaf cgroup ages, + * the virtual inactive lists of all its parents, including + * the root cgroup's, age as well. + */ + do { + atomic_long_add(nr_pages, &lruvec->nonresident_age); + } while ((lruvec = parent_lruvec(lruvec))); +} + +/** + * workingset_eviction - note the eviction of a folio from memory + * @target_memcg: the cgroup that is causing the reclaim + * @folio: the folio being evicted + * + * Return: a shadow entry to be stored in @folio->mapping->i_pages in place + * of the evicted @folio so that a later refault can be detected. 
+ */ +void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) +{ + struct pglist_data *pgdat = folio_pgdat(folio); + unsigned long eviction; + struct lruvec *lruvec; + int memcgid; + + /* Folio is fully exclusive and pins folio's memory cgroup pointer */ + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (lru_gen_enabled()) + return lru_gen_eviction(folio); + + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + /* XXX: target_memcg can be NULL, go through lruvec */ + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + eviction = atomic_long_read(&lruvec->nonresident_age); + eviction >>= bucket_order; + workingset_age_nonresident(lruvec, folio_nr_pages(folio)); + return pack_shadow(memcgid, pgdat, eviction, + folio_test_workingset(folio)); +} + +/** + * workingset_refault - Evaluate the refault of a previously evicted folio. + * @folio: The freshly allocated replacement folio. + * @shadow: Shadow entry of the evicted folio. + * + * Calculates and evaluates the refault distance of the previously + * evicted folio in the context of the node and the memcg whose memory + * pressure caused the eviction. + */ +void workingset_refault(struct folio *folio, void *shadow) +{ + bool file = folio_is_file_lru(folio); + struct mem_cgroup *eviction_memcg; + struct lruvec *eviction_lruvec; + unsigned long refault_distance; + unsigned long workingset_size; + struct pglist_data *pgdat; + struct mem_cgroup *memcg; + unsigned long eviction; + struct lruvec *lruvec; + unsigned long refault; + bool workingset; + int memcgid; + long nr; + + if (lru_gen_enabled()) { + lru_gen_refault(folio, shadow); + return; + } + + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); + eviction <<= bucket_order; + + rcu_read_lock(); + /* + * Look up the memcg associated with the stored ID. It might + * have been deleted since the folio's eviction. 
+ * + * Note that in rare events the ID could have been recycled + * for a new cgroup that refaults a shared folio. This is + * impossible to tell from the available data. However, this + * should be a rare and limited disturbance, and activations + * are always speculative anyway. Ultimately, it's the aging + * algorithm's job to shake out the minimum access frequency + * for the active cache. + * + * XXX: On !CONFIG_MEMCG, this will always return NULL; it + * would be better if the root_mem_cgroup existed in all + * configurations instead. + */ + eviction_memcg = mem_cgroup_from_id(memcgid); + if (!mem_cgroup_disabled() && !eviction_memcg) + goto out; + eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); + refault = atomic_long_read(&eviction_lruvec->nonresident_age); + + /* + * Calculate the refault distance + * + * The unsigned subtraction here gives an accurate distance + * across nonresident_age overflows in most cases. There is a + * special case: usually, shadow entries have a short lifetime + * and are either refaulted or reclaimed along with the inode + * before they get too old. But it is not impossible for the + * nonresident_age to lap a shadow entry in the field, which + * can then result in a false small refault distance, leading + * to a false activation should this old entry actually + * refault again. However, earlier kernels used to deactivate + * unconditionally with *every* reclaim invocation for the + * longest time, so the occasional inappropriate activation + * leading to pressure on the active list is not a problem. + */ + refault_distance = (refault - eviction) & EVICTION_MASK; + + /* + * The activation decision for this folio is made at the level + * where the eviction occurred, as that is where the LRU order + * during folio reclaim is being determined. + * + * However, the cgroup that will own the folio is the one that + * is actually experiencing the refault event. 
+ */ + nr = folio_nr_pages(folio); + memcg = folio_memcg(folio); + lruvec = mem_cgroup_lruvec(memcg, pgdat); + + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); + + mem_cgroup_flush_stats_delayed(); + /* + * Compare the distance to the existing workingset size. We + * don't activate pages that couldn't stay resident even if + * all the memory was available to the workingset. Whether + * workingset competition needs to consider anon or not depends + * on having swap. + */ + workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); + if (!file) { + workingset_size += lruvec_page_state(eviction_lruvec, + NR_INACTIVE_FILE); + } + if (mem_cgroup_get_nr_swap_pages(memcg) > 0) { + workingset_size += lruvec_page_state(eviction_lruvec, + NR_ACTIVE_ANON); + if (file) { + workingset_size += lruvec_page_state(eviction_lruvec, + NR_INACTIVE_ANON); + } + } + if (refault_distance > workingset_size) + goto out; + + folio_set_active(folio); + workingset_age_nonresident(lruvec, nr); + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); + + /* Folio was active prior to eviction */ + if (workingset) { + folio_set_workingset(folio); + /* XXX: Move to lru_cache_add() when it supports new vs putback */ + lru_note_cost_folio(folio); + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); + } +out: + rcu_read_unlock(); +} + +/** + * workingset_activation - note a page activation + * @folio: Folio that is being activated. + */ +void workingset_activation(struct folio *folio) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); + /* + * Filter non-memcg pages here, e.g. unmap can call + * mark_page_accessed() on VDSO pages. + * + * XXX: See workingset_refault() - this should return + * root_mem_cgroup even for !CONFIG_MEMCG. 
+ */ + memcg = folio_memcg_rcu(folio); + if (!mem_cgroup_disabled() && !memcg) + goto out; + workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); +out: + rcu_read_unlock(); +} + +/* + * Shadow entries reflect the share of the working set that does not + * fit into memory, so their number depends on the access pattern of + * the workload. In most cases, they will refault or get reclaimed + * along with the inode, but a (malicious) workload that streams + * through files with a total size several times that of available + * memory, while preventing the inodes from being reclaimed, can + * create excessive amounts of shadow nodes. To keep a lid on this, + * track shadow nodes and reclaim them when they grow way past the + * point where they would still be useful. + */ + +struct list_lru shadow_nodes; + +void workingset_update_node(struct xa_node *node) +{ + struct address_space *mapping; + + /* + * Track non-empty nodes that contain only shadow entries; + * unlink those that contain pages or are being freed. + * + * Avoid acquiring the list_lru lock when the nodes are + * already where they should be. The list_empty() test is safe + * as node->private_list is protected by the i_pages lock. 
+ */ + mapping = container_of(node->array, struct address_space, i_pages); + lockdep_assert_held(&mapping->i_pages.xa_lock); + + if (node->count && node->count == node->nr_values) { + if (list_empty(&node->private_list)) { + list_lru_add(&shadow_nodes, &node->private_list); + __inc_lruvec_kmem_state(node, WORKINGSET_NODES); + } + } else { + if (!list_empty(&node->private_list)) { + list_lru_del(&shadow_nodes, &node->private_list); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); + } + } +} + +static unsigned long count_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long max_nodes; + unsigned long nodes; + unsigned long pages; + + nodes = list_lru_shrink_count(&shadow_nodes, sc); + if (!nodes) + return SHRINK_EMPTY; + + /* + * Approximate a reasonable limit for the nodes + * containing shadow entries. We don't need to keep more + * shadow entries than possible pages on the active list, + * since refault distances bigger than that are dismissed. + * + * The size of the active list converges toward 100% of + * overall page cache as memory grows, with only a tiny + * inactive list. Assume the total cache size for that. + * + * Nodes might be sparsely populated, with only one shadow + * entry in the extreme case. Obviously, we cannot keep one + * node for every eligible shadow entry, so compromise on a + * worst-case density of 1/8th. Below that, not all eligible + * refaults can be detected anymore. 
+ * + * On 64-bit with 7 xa_nodes per page and 64 slots + * each, this will reclaim shadow entries when they consume + * ~1.8% of available memory: + * + * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE + */ +#ifdef CONFIG_MEMCG + if (sc->memcg) { + struct lruvec *lruvec; + int i; + + lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) + pages += lruvec_page_state_local(lruvec, + NR_LRU_BASE + i); + pages += lruvec_page_state_local( + lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT; + pages += lruvec_page_state_local( + lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; + } else +#endif + pages = node_present_pages(sc->nid); + + max_nodes = pages >> (XA_CHUNK_SHIFT - 3); + + if (nodes <= max_nodes) + return 0; + return nodes - max_nodes; +} + +static enum lru_status shadow_lru_isolate(struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lru_lock, + void *arg) __must_hold(lru_lock) +{ + struct xa_node *node = container_of(item, struct xa_node, private_list); + struct address_space *mapping; + int ret; + + /* + * Page cache insertions and deletions synchronously maintain + * the shadow node LRU under the i_pages lock and the + * lru_lock. Because the page cache tree is emptied before + * the inode can be destroyed, holding the lru_lock pins any + * address_space that has nodes on the LRU. + * + * We can then safely transition to the i_pages lock to + * pin only the address_space of the particular node we want + * to reclaim, take the node off-LRU, and drop the lru_lock. 
+ */ + + mapping = container_of(node->array, struct address_space, i_pages); + + /* Coming from the list, invert the lock order */ + if (!xa_trylock(&mapping->i_pages)) { + spin_unlock_irq(lru_lock); + ret = LRU_RETRY; + goto out; + } + + if (!spin_trylock(&mapping->host->i_lock)) { + xa_unlock(&mapping->i_pages); + spin_unlock_irq(lru_lock); + ret = LRU_RETRY; + goto out; + } + + list_lru_isolate(lru, item); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); + + spin_unlock(lru_lock); + + /* + * The nodes should only contain one or more shadow entries, + * no pages, so we expect to be able to remove them all and + * delete and free the empty node afterwards. + */ + if (WARN_ON_ONCE(!node->nr_values)) + goto out_invalid; + if (WARN_ON_ONCE(node->count != node->nr_values)) + goto out_invalid; + xa_delete_node(node, workingset_update_node); + __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); + +out_invalid: + xa_unlock_irq(&mapping->i_pages); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); + ret = LRU_REMOVED_RETRY; +out: + cond_resched(); + spin_lock_irq(lru_lock); + return ret; +} + +static unsigned long scan_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + /* list_lru lock nests inside the IRQ-safe i_pages lock */ + return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate, + NULL); +} + +static struct shrinker workingset_shadow_shrinker = { + .count_objects = count_shadow_nodes, + .scan_objects = scan_shadow_nodes, + .seeks = 0, /* ->count reports only fully expendable nodes */ + .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, +}; + +/* + * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe + * i_pages lock. 
+ */ +static struct lock_class_key shadow_nodes_key; +/* + * Boot-time setup: size the eviction timestamp buckets, then create the + * shadow node list_lru and register its shrinker. Returns 0 on success, + * otherwise the error from prealloc_shrinker() or __list_lru_init(); in + * the latter case the preallocated shrinker is freed again. + */ +static int __init workingset_init(void) +{ + unsigned int timestamp_bits; + unsigned int max_order; + int ret; + + BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); + /* + * Calculate the eviction bucket size to cover the longest + * actionable refault distance, which is currently half of + * memory (totalram_pages/2). However, memory hotplug may add + * some more pages at runtime, so keep working with up to + * double the initial memory by using totalram_pages as-is. + * + * bucket_order is left at zero unless max_order exceeds the + * number of bits available for the timestamp. + */ + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; + max_order = fls_long(totalram_pages() - 1); + if (max_order > timestamp_bits) + bucket_order = max_order - timestamp_bits; + pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", + timestamp_bits, max_order, bucket_order); + + ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow"); + if (ret) + goto err; + ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, + &workingset_shadow_shrinker); + if (ret) + goto err_list_lru; + register_shrinker_prepared(&workingset_shadow_shrinker); + return 0; +err_list_lru: + free_prealloced_shrinker(&workingset_shadow_shrinker); +err: + return ret; +} +module_init(workingset_init); diff --git a/mm/z3fold.c b/mm/z3fold.c new file mode 100644 index 000000000..cf71da10d --- /dev/null +++ b/mm/z3fold.c @@ -0,0 +1,1710 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * z3fold.c + * + * Author: Vitaly Wool + * Copyright (C) 2016, Sony Mobile Communications Inc. + * + * This implementation is based on zbud written by Seth Jennings. + * + * z3fold is a special purpose allocator for storing compressed pages. It + * can store up to three compressed pages per page which improves the + * compression ratio of zbud while retaining its main concepts (e.g. always + * storing an integral number of objects per page) and simplicity. 
+ * It still has simple and deterministic reclaim properties that make it + * preferable to a higher density approach (with no requirement on integral + * number of objects per page) when reclaim is used. + * + * As in zbud, pages are divided into "chunks". The size of the chunks is + * fixed at compile time and is determined by NCHUNKS_ORDER below. + * + * z3fold doesn't export any API and is meant to be used via zpool API. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * NCHUNKS_ORDER determines the internal allocation granularity, effectively + * adjusting internal fragmentation. It also determines the number of + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the + * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks + * in the beginning of an allocated page are occupied by z3fold header, so + * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y), + * which is the max number of free chunks in a z3fold page; there will also + * be 63 or 62 freelists per pool, respectively. 
+ */ +#define NCHUNKS_ORDER 6 + +#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) +#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) +#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) +#define NCHUNKS (TOTAL_CHUNKS - ZHDR_CHUNKS) + +#define BUDDY_MASK (0x3) +#define BUDDY_SHIFT 2 +#define SLOTS_ALIGN (0x40) + +/***************** + * Structures +*****************/ +struct z3fold_pool; +struct z3fold_ops { + int (*evict)(struct z3fold_pool *pool, unsigned long handle); +}; + +enum buddy { + HEADLESS = 0, + FIRST, + MIDDLE, + LAST, + BUDDIES_MAX = LAST +}; + +struct z3fold_buddy_slots { + /* + * we are using BUDDY_MASK in handle_to_buddy etc. so there should + * be enough slots to hold all possible variants + */ + unsigned long slot[BUDDY_MASK + 1]; + unsigned long pool; /* back link */ + rwlock_t lock; +}; +#define HANDLE_FLAG_MASK (0x03) + +/* + * struct z3fold_header - z3fold page metadata occupying first chunks of each + * z3fold page, except for HEADLESS pages + * @buddy: links the z3fold page into the relevant list in the + * pool + * @page_lock: per-page lock + * @refcount: reference count for the z3fold page + * @work: work_struct for page layout optimization + * @slots: pointer to the structure holding buddy slots + * @pool: pointer to the containing pool + * @cpu: CPU which this page "belongs" to + * @first_chunks: the size of the first buddy in chunks, 0 if free + * @middle_chunks: the size of the middle buddy in chunks, 0 if free + * @last_chunks: the size of the last buddy in chunks, 0 if free + * @start_middle: index of the chunk at which the middle buddy starts + * @first_num: the starting number (for the first handle) + * @mapped_count: the number of objects currently mapped + * @foreign_handles: decremented by free_handle() when the freed handle + * lives in a slots structure other than @slots + */ +struct z3fold_header { + struct list_head buddy; + spinlock_t page_lock; + struct kref refcount; + struct work_struct work; + struct z3fold_buddy_slots *slots; + struct z3fold_pool *pool; + short cpu; + unsigned short 
first_chunks; + unsigned short middle_chunks; + unsigned short last_chunks; + unsigned short start_middle; + unsigned short first_num:2; + unsigned short mapped_count:2; + unsigned short foreign_handles:2; +}; + +/** + * struct z3fold_pool - stores metadata for each z3fold pool + * @name: pool name + * @lock: protects pool unbuddied/lru lists + * @stale_lock: protects pool stale page list + * @unbuddied: per-cpu array of lists tracking z3fold pages that have at + * least one free buddy; the list each z3fold page is added to + * depends on the size of its free region. + * @lru: list tracking the z3fold pages in LRU order by most recently + * added buddy. + * @stale: list of pages marked for freeing + * @pages_nr: number of z3fold pages in the pool. + * @c_handle: cache for z3fold_buddy_slots allocation + * @ops: pointer to a structure of user defined operations specified at + * pool creation time. + * @zpool: zpool driver + * @zpool_ops: zpool operations structure with an evict callback + * @compact_wq: workqueue for page layout background optimization + * @release_wq: workqueue for safe page release + * @work: work_struct for safe page release + * + * This structure is allocated at pool creation time and maintains metadata + * pertaining to a particular z3fold pool. 
+ */ +struct z3fold_pool { + const char *name; + spinlock_t lock; + spinlock_t stale_lock; + struct list_head *unbuddied; + struct list_head lru; + struct list_head stale; + atomic64_t pages_nr; + struct kmem_cache *c_handle; + const struct z3fold_ops *ops; + struct zpool *zpool; + const struct zpool_ops *zpool_ops; + struct workqueue_struct *compact_wq; + struct workqueue_struct *release_wq; + struct work_struct work; +}; + +/* + * Internal z3fold page flags + */ +enum z3fold_page_flags { + PAGE_HEADLESS = 0, + MIDDLE_CHUNK_MAPPED, + NEEDS_COMPACTING, + PAGE_STALE, + PAGE_CLAIMED, /* by either reclaim or free */ + PAGE_MIGRATED, /* page is migrated and soon to be released */ +}; + +/* + * handle flags, go under HANDLE_FLAG_MASK + */ +enum z3fold_handle_flags { + HANDLES_NOFREE = 0, +}; + +/* + * Forward declarations + */ +static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool); +static void compact_page_work(struct work_struct *w); + +/***************** + * Helpers +*****************/ + +/* Converts an allocation size in bytes to size in z3fold chunks */ +static int size_to_chunks(size_t size) +{ + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; +} + +#define for_each_unbuddied_list(_iter, _begin) \ + for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) + +static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, + gfp_t gfp) +{ + struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle, + gfp); + + if (slots) { + /* It will be freed separately in free_handle(). 
*/ + kmemleak_not_leak(slots); + slots->pool = (unsigned long)pool; + rwlock_init(&slots->lock); + } + + return slots; +} + +static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s) +{ + return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK); +} + +static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle) +{ + return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1)); +} + +/* Lock a z3fold page */ +static inline void z3fold_page_lock(struct z3fold_header *zhdr) +{ + spin_lock(&zhdr->page_lock); +} + +/* Try to lock a z3fold page */ +static inline int z3fold_page_trylock(struct z3fold_header *zhdr) +{ + return spin_trylock(&zhdr->page_lock); +} + +/* Unlock a z3fold page */ +static inline void z3fold_page_unlock(struct z3fold_header *zhdr) +{ + spin_unlock(&zhdr->page_lock); +} + +/* return locked z3fold page if it's not headless */ +static inline struct z3fold_header *get_z3fold_header(unsigned long handle) +{ + struct z3fold_buddy_slots *slots; + struct z3fold_header *zhdr; + int locked = 0; + + if (!(handle & (1 << PAGE_HEADLESS))) { + slots = handle_to_slots(handle); + do { + unsigned long addr; + + read_lock(&slots->lock); + addr = *(unsigned long *)handle; + zhdr = (struct z3fold_header *)(addr & PAGE_MASK); + locked = z3fold_page_trylock(zhdr); + read_unlock(&slots->lock); + if (locked) { + struct page *page = virt_to_page(zhdr); + + if (!test_bit(PAGE_MIGRATED, &page->private)) + break; + z3fold_page_unlock(zhdr); + } + cpu_relax(); + } while (true); + } else { + zhdr = (struct z3fold_header *)(handle & PAGE_MASK); + } + + return zhdr; +} + +static inline void put_z3fold_header(struct z3fold_header *zhdr) +{ + struct page *page = virt_to_page(zhdr); + + if (!test_bit(PAGE_HEADLESS, &page->private)) + z3fold_page_unlock(zhdr); +} + +static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr) +{ + struct z3fold_buddy_slots *slots; + int i; + bool is_free; + + if (WARN_ON(*(unsigned 
long *)handle == 0)) + return; + + slots = handle_to_slots(handle); + write_lock(&slots->lock); + *(unsigned long *)handle = 0; + + if (test_bit(HANDLES_NOFREE, &slots->pool)) { + write_unlock(&slots->lock); + return; /* simple case, nothing else to do */ + } + + if (zhdr->slots != slots) + zhdr->foreign_handles--; + + is_free = true; + for (i = 0; i <= BUDDY_MASK; i++) { + if (slots->slot[i]) { + is_free = false; + break; + } + } + write_unlock(&slots->lock); + + if (is_free) { + struct z3fold_pool *pool = slots_to_pool(slots); + + if (zhdr->slots == slots) + zhdr->slots = NULL; + kmem_cache_free(pool->c_handle, slots); + } +} + +/* Initializes the z3fold header of a newly allocated z3fold page */ +static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, + struct z3fold_pool *pool, gfp_t gfp) +{ + struct z3fold_header *zhdr = page_address(page); + struct z3fold_buddy_slots *slots; + + INIT_LIST_HEAD(&page->lru); + clear_bit(PAGE_HEADLESS, &page->private); + clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + clear_bit(NEEDS_COMPACTING, &page->private); + clear_bit(PAGE_STALE, &page->private); + clear_bit(PAGE_CLAIMED, &page->private); + clear_bit(PAGE_MIGRATED, &page->private); + if (headless) + return zhdr; + + slots = alloc_slots(pool, gfp); + if (!slots) + return NULL; + + memset(zhdr, 0, sizeof(*zhdr)); + spin_lock_init(&zhdr->page_lock); + kref_init(&zhdr->refcount); + zhdr->cpu = -1; + zhdr->slots = slots; + zhdr->pool = pool; + INIT_LIST_HEAD(&zhdr->buddy); + INIT_WORK(&zhdr->work, compact_page_work); + return zhdr; +} + +/* Resets the struct page fields and frees the page */ +static void free_z3fold_page(struct page *page, bool headless) +{ + if (!headless) { + lock_page(page); + __ClearPageMovable(page); + unlock_page(page); + } + __free_page(page); +} + +/* Helper function to build the index */ +static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) +{ + return (bud + zhdr->first_num) & BUDDY_MASK; +} + +/* + * Encodes 
the handle of a particular buddy within a z3fold page + * Pool lock should be held as this function accesses first_num + */ +static unsigned long __encode_handle(struct z3fold_header *zhdr, + struct z3fold_buddy_slots *slots, + enum buddy bud) +{ + unsigned long h = (unsigned long)zhdr; + int idx = 0; + + /* + * For a headless page, its handle is its pointer with the extra + * PAGE_HEADLESS bit set + */ + if (bud == HEADLESS) + return h | (1 << PAGE_HEADLESS); + + /* + * Otherwise the encoded value (header address plus slot index, plus + * last_chunks << BUDDY_SHIFT for the LAST buddy) is stored in + * slots->slot[idx] under the slots write lock, and the address of + * that slot is returned as the handle. + */ + idx = __idx(zhdr, bud); + h += idx; + if (bud == LAST) + h |= (zhdr->last_chunks << BUDDY_SHIFT); + + write_lock(&slots->lock); + slots->slot[idx] = h; + write_unlock(&slots->lock); + return (unsigned long)&slots->slot[idx]; +} +/* Encode a handle for @bud using the page's own slots (zhdr->slots). */ +static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) +{ + return __encode_handle(zhdr, zhdr->slots, bud); +} + +/* + * Returns the in-page offset bits of the slot value shifted down by + * BUDDY_SHIFT. Only the LAST buddy stores its chunk count there (see + * __encode_handle()); for the other buddies the result is zero. + */ +static unsigned short handle_to_chunks(unsigned long handle) +{ + struct z3fold_buddy_slots *slots = handle_to_slots(handle); + unsigned long addr; + + read_lock(&slots->lock); + addr = *(unsigned long *)handle; + read_unlock(&slots->lock); + return (addr & ~PAGE_MASK) >> BUDDY_SHIFT; +} + +/* + * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle + * but that doesn't matter, because the masking will result in the + * correct buddy number. 
+ */ +static enum buddy handle_to_buddy(unsigned long handle) +{ + struct z3fold_header *zhdr; + struct z3fold_buddy_slots *slots = handle_to_slots(handle); + unsigned long addr; + + read_lock(&slots->lock); + WARN_ON(handle & (1 << PAGE_HEADLESS)); + addr = *(unsigned long *)handle; + read_unlock(&slots->lock); + zhdr = (struct z3fold_header *)(addr & PAGE_MASK); + return (addr - zhdr->first_num) & BUDDY_MASK; +} + +static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr) +{ + return zhdr->pool; +} + +static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) +{ + struct page *page = virt_to_page(zhdr); + struct z3fold_pool *pool = zhdr_to_pool(zhdr); + + WARN_ON(!list_empty(&zhdr->buddy)); + set_bit(PAGE_STALE, &page->private); + clear_bit(NEEDS_COMPACTING, &page->private); + spin_lock(&pool->lock); + if (!list_empty(&page->lru)) + list_del_init(&page->lru); + spin_unlock(&pool->lock); + + if (locked) + z3fold_page_unlock(zhdr); + + spin_lock(&pool->stale_lock); + list_add(&zhdr->buddy, &pool->stale); + queue_work(pool->release_wq, &pool->work); + spin_unlock(&pool->stale_lock); + + atomic64_dec(&pool->pages_nr); +} + +static void release_z3fold_page_locked(struct kref *ref) +{ + struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, + refcount); + WARN_ON(z3fold_page_trylock(zhdr)); + __release_z3fold_page(zhdr, true); +} + +static void release_z3fold_page_locked_list(struct kref *ref) +{ + struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, + refcount); + struct z3fold_pool *pool = zhdr_to_pool(zhdr); + + spin_lock(&pool->lock); + list_del_init(&zhdr->buddy); + spin_unlock(&pool->lock); + + WARN_ON(z3fold_page_trylock(zhdr)); + __release_z3fold_page(zhdr, true); +} + +static void free_pages_work(struct work_struct *w) +{ + struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work); + + spin_lock(&pool->stale_lock); + while (!list_empty(&pool->stale)) { + struct z3fold_header *zhdr = 
list_first_entry(&pool->stale, + struct z3fold_header, buddy); + struct page *page = virt_to_page(zhdr); + + list_del(&zhdr->buddy); + if (WARN_ON(!test_bit(PAGE_STALE, &page->private))) + continue; + spin_unlock(&pool->stale_lock); + cancel_work_sync(&zhdr->work); + free_z3fold_page(page, false); + cond_resched(); + spin_lock(&pool->stale_lock); + } + spin_unlock(&pool->stale_lock); +} + +/* + * Returns the number of free chunks in a z3fold page. + * NB: can't be used with HEADLESS pages. + */ +static int num_free_chunks(struct z3fold_header *zhdr) +{ + int nfree; + /* + * If there is a middle object, pick up the bigger free space + * either before or after it. Otherwise just subtract the number + * of chunks occupied by the first and the last objects. + */ + if (zhdr->middle_chunks != 0) { + int nfree_before = zhdr->first_chunks ? + 0 : zhdr->start_middle - ZHDR_CHUNKS; + int nfree_after = zhdr->last_chunks ? + 0 : TOTAL_CHUNKS - + (zhdr->start_middle + zhdr->middle_chunks); + nfree = max(nfree_before, nfree_after); + } else + nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; + return nfree; +} + +/* Add to the appropriate unbuddied list */ +static inline void add_to_unbuddied(struct z3fold_pool *pool, + struct z3fold_header *zhdr) +{ + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || + zhdr->middle_chunks == 0) { + struct list_head *unbuddied; + int freechunks = num_free_chunks(zhdr); + + migrate_disable(); + unbuddied = this_cpu_ptr(pool->unbuddied); + spin_lock(&pool->lock); + list_add(&zhdr->buddy, &unbuddied[freechunks]); + spin_unlock(&pool->lock); + zhdr->cpu = smp_processor_id(); + migrate_enable(); + } +} + +static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks) +{ + enum buddy bud = HEADLESS; + + if (zhdr->middle_chunks) { + if (!zhdr->first_chunks && + chunks <= zhdr->start_middle - ZHDR_CHUNKS) + bud = FIRST; + else if (!zhdr->last_chunks) + bud = LAST; + } else { + if (!zhdr->first_chunks) + bud = FIRST; + 
else if (!zhdr->last_chunks) + bud = LAST; + else + bud = MIDDLE; + } + + return bud; +} + +static inline void *mchunk_memmove(struct z3fold_header *zhdr, + unsigned short dst_chunk) +{ + void *beg = zhdr; + return memmove(beg + (dst_chunk << CHUNK_SHIFT), + beg + (zhdr->start_middle << CHUNK_SHIFT), + zhdr->middle_chunks << CHUNK_SHIFT); +} + +static inline bool buddy_single(struct z3fold_header *zhdr) +{ + return !((zhdr->first_chunks && zhdr->middle_chunks) || + (zhdr->first_chunks && zhdr->last_chunks) || + (zhdr->middle_chunks && zhdr->last_chunks)); +} + +static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr) +{ + struct z3fold_pool *pool = zhdr_to_pool(zhdr); + void *p = zhdr; + unsigned long old_handle = 0; + size_t sz = 0; + struct z3fold_header *new_zhdr = NULL; + int first_idx = __idx(zhdr, FIRST); + int middle_idx = __idx(zhdr, MIDDLE); + int last_idx = __idx(zhdr, LAST); + unsigned short *moved_chunks = NULL; + + /* + * No need to protect slots here -- all the slots are "local" and + * the page lock is already taken + */ + if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) { + p += ZHDR_SIZE_ALIGNED; + sz = zhdr->first_chunks << CHUNK_SHIFT; + old_handle = (unsigned long)&zhdr->slots->slot[first_idx]; + moved_chunks = &zhdr->first_chunks; + } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) { + p += zhdr->start_middle << CHUNK_SHIFT; + sz = zhdr->middle_chunks << CHUNK_SHIFT; + old_handle = (unsigned long)&zhdr->slots->slot[middle_idx]; + moved_chunks = &zhdr->middle_chunks; + } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) { + p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + sz = zhdr->last_chunks << CHUNK_SHIFT; + old_handle = (unsigned long)&zhdr->slots->slot[last_idx]; + moved_chunks = &zhdr->last_chunks; + } + + if (sz > 0) { + enum buddy new_bud = HEADLESS; + short chunks = size_to_chunks(sz); + void *q; + + new_zhdr = __z3fold_alloc(pool, sz, false); + if (!new_zhdr) + return NULL; + + 
if (WARN_ON(new_zhdr == zhdr)) + goto out_fail; + + new_bud = get_free_buddy(new_zhdr, chunks); + q = new_zhdr; + switch (new_bud) { + case FIRST: + new_zhdr->first_chunks = chunks; + q += ZHDR_SIZE_ALIGNED; + break; + case MIDDLE: + new_zhdr->middle_chunks = chunks; + new_zhdr->start_middle = + new_zhdr->first_chunks + ZHDR_CHUNKS; + q += new_zhdr->start_middle << CHUNK_SHIFT; + break; + case LAST: + new_zhdr->last_chunks = chunks; + q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT); + break; + default: + goto out_fail; + } + new_zhdr->foreign_handles++; + memcpy(q, p, sz); + write_lock(&zhdr->slots->lock); + *(unsigned long *)old_handle = (unsigned long)new_zhdr + + __idx(new_zhdr, new_bud); + if (new_bud == LAST) + *(unsigned long *)old_handle |= + (new_zhdr->last_chunks << BUDDY_SHIFT); + write_unlock(&zhdr->slots->lock); + add_to_unbuddied(pool, new_zhdr); + z3fold_page_unlock(new_zhdr); + + *moved_chunks = 0; + } + + return new_zhdr; + +out_fail: + if (new_zhdr && !kref_put(&new_zhdr->refcount, release_z3fold_page_locked)) { + add_to_unbuddied(pool, new_zhdr); + z3fold_page_unlock(new_zhdr); + } + return NULL; + +} + +#define BIG_CHUNK_GAP 3 +/* Has to be called with lock held */ +static int z3fold_compact_page(struct z3fold_header *zhdr) +{ + struct page *page = virt_to_page(zhdr); + + if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private)) + return 0; /* can't move middle chunk, it's used */ + + if (unlikely(PageIsolated(page))) + return 0; + + if (zhdr->middle_chunks == 0) + return 0; /* nothing to compact */ + + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* move to the beginning */ + mchunk_memmove(zhdr, ZHDR_CHUNKS); + zhdr->first_chunks = zhdr->middle_chunks; + zhdr->middle_chunks = 0; + zhdr->start_middle = 0; + zhdr->first_num++; + return 1; + } + + /* + * moving data is expensive, so let's only do that if + * there's substantial gain (at least BIG_CHUNK_GAP chunks) + */ + if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 && + 
zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >= + BIG_CHUNK_GAP) { + mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS); + zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; + return 1; + } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 && + TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle + + zhdr->middle_chunks) >= + BIG_CHUNK_GAP) { + unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks - + zhdr->middle_chunks; + mchunk_memmove(zhdr, new_start); + zhdr->start_middle = new_start; + return 1; + } + + return 0; +} + +static void do_compact_page(struct z3fold_header *zhdr, bool locked) +{ + struct z3fold_pool *pool = zhdr_to_pool(zhdr); + struct page *page; + + page = virt_to_page(zhdr); + if (locked) + WARN_ON(z3fold_page_trylock(zhdr)); + else + z3fold_page_lock(zhdr); + if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) { + z3fold_page_unlock(zhdr); + return; + } + spin_lock(&pool->lock); + list_del_init(&zhdr->buddy); + spin_unlock(&pool->lock); + + if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) + return; + + if (test_bit(PAGE_STALE, &page->private) || + test_and_set_bit(PAGE_CLAIMED, &page->private)) { + z3fold_page_unlock(zhdr); + return; + } + + if (!zhdr->foreign_handles && buddy_single(zhdr) && + zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) { + if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + clear_bit(PAGE_CLAIMED, &page->private); + z3fold_page_unlock(zhdr); + } + return; + } + + z3fold_compact_page(zhdr); + add_to_unbuddied(pool, zhdr); + clear_bit(PAGE_CLAIMED, &page->private); + z3fold_page_unlock(zhdr); +} + +static void compact_page_work(struct work_struct *w) +{ + struct z3fold_header *zhdr = container_of(w, struct z3fold_header, + work); + + do_compact_page(zhdr, false); +} + +/* returns _locked_ z3fold page header or NULL */ +static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, + size_t size, bool can_sleep) +{ + struct 
z3fold_header *zhdr = NULL; + struct page *page; + struct list_head *unbuddied; + int chunks = size_to_chunks(size), i; + +lookup: + migrate_disable(); + /* First, try to find an unbuddied z3fold page. */ + unbuddied = this_cpu_ptr(pool->unbuddied); + for_each_unbuddied_list(i, chunks) { + struct list_head *l = &unbuddied[i]; + + zhdr = list_first_entry_or_null(READ_ONCE(l), + struct z3fold_header, buddy); + + if (!zhdr) + continue; + + /* Re-check under lock. */ + spin_lock(&pool->lock); + if (unlikely(zhdr != list_first_entry(READ_ONCE(l), + struct z3fold_header, buddy)) || + !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + zhdr = NULL; + migrate_enable(); + if (can_sleep) + cond_resched(); + goto lookup; + } + list_del_init(&zhdr->buddy); + zhdr->cpu = -1; + spin_unlock(&pool->lock); + + page = virt_to_page(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private) || + test_bit(PAGE_CLAIMED, &page->private)) { + z3fold_page_unlock(zhdr); + zhdr = NULL; + migrate_enable(); + if (can_sleep) + cond_resched(); + goto lookup; + } + + /* + * this page could not be removed from its unbuddied + * list while pool lock was held, and then we've taken + * page lock so kref_put could not be called before + * we got here, so it's safe to just call kref_get() + */ + kref_get(&zhdr->refcount); + break; + } + migrate_enable(); + + if (!zhdr) { + int cpu; + + /* look for _exact_ match on other cpus' lists */ + for_each_online_cpu(cpu) { + struct list_head *l; + + unbuddied = per_cpu_ptr(pool->unbuddied, cpu); + spin_lock(&pool->lock); + l = &unbuddied[chunks]; + + zhdr = list_first_entry_or_null(READ_ONCE(l), + struct z3fold_header, buddy); + + if (!zhdr || !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + zhdr = NULL; + continue; + } + list_del_init(&zhdr->buddy); + zhdr->cpu = -1; + spin_unlock(&pool->lock); + + page = virt_to_page(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private) || + test_bit(PAGE_CLAIMED, &page->private)) { + 
z3fold_page_unlock(zhdr); + zhdr = NULL; + if (can_sleep) + cond_resched(); + continue; + } + kref_get(&zhdr->refcount); + break; + } + } + + if (zhdr && !zhdr->slots) { + zhdr->slots = alloc_slots(pool, GFP_ATOMIC); + if (!zhdr->slots) + goto out_fail; + } + return zhdr; + +out_fail: + if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + add_to_unbuddied(pool, zhdr); + z3fold_page_unlock(zhdr); + } + return NULL; +} + +/* + * API Functions + */ + +/** + * z3fold_create_pool() - create a new z3fold pool + * @name: pool name + * @gfp: gfp flags when allocating the z3fold pool structure + * @ops: user-defined operations for the z3fold pool + * + * Return: pointer to the new z3fold pool or NULL if the metadata allocation + * failed. + */ +static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, + const struct z3fold_ops *ops) +{ + struct z3fold_pool *pool = NULL; + int i, cpu; + + pool = kzalloc(sizeof(struct z3fold_pool), gfp); + if (!pool) + goto out; + pool->c_handle = kmem_cache_create("z3fold_handle", + sizeof(struct z3fold_buddy_slots), + SLOTS_ALIGN, 0, NULL); + if (!pool->c_handle) + goto out_c; + spin_lock_init(&pool->lock); + spin_lock_init(&pool->stale_lock); + pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS, + __alignof__(struct list_head)); + if (!pool->unbuddied) + goto out_pool; + for_each_possible_cpu(cpu) { + struct list_head *unbuddied = + per_cpu_ptr(pool->unbuddied, cpu); + for_each_unbuddied_list(i, 0) + INIT_LIST_HEAD(&unbuddied[i]); + } + INIT_LIST_HEAD(&pool->lru); + INIT_LIST_HEAD(&pool->stale); + atomic64_set(&pool->pages_nr, 0); + pool->name = name; + pool->compact_wq = create_singlethread_workqueue(pool->name); + if (!pool->compact_wq) + goto out_unbuddied; + pool->release_wq = create_singlethread_workqueue(pool->name); + if (!pool->release_wq) + goto out_wq; + INIT_WORK(&pool->work, free_pages_work); + pool->ops = ops; + return pool; + +out_wq: + destroy_workqueue(pool->compact_wq); 
+out_unbuddied: + free_percpu(pool->unbuddied); +out_pool: + kmem_cache_destroy(pool->c_handle); +out_c: + kfree(pool); +out: + return NULL; +} + +/** + * z3fold_destroy_pool() - destroys an existing z3fold pool + * @pool: the z3fold pool to be destroyed + * + * The pool should be emptied before this function is called. + */ +static void z3fold_destroy_pool(struct z3fold_pool *pool) +{ + kmem_cache_destroy(pool->c_handle); + + /* + * We need to destroy pool->compact_wq before pool->release_wq, + * as any pending work on pool->compact_wq will call + * queue_work(pool->release_wq, &pool->work). + * + * There are still outstanding pages until both workqueues are drained, + * so we cannot unregister migration until then. + */ + + destroy_workqueue(pool->compact_wq); + destroy_workqueue(pool->release_wq); + free_percpu(pool->unbuddied); + kfree(pool); +} + +static const struct movable_operations z3fold_mops; + +/** + * z3fold_alloc() - allocates a region of a given size + * @pool: z3fold pool from which to allocate + * @size: size in bytes of the desired allocation + * @gfp: gfp flags used if the pool needs to grow + * @handle: handle of the new allocation + * + * This function will attempt to find a free region in the pool large enough to + * satisfy the allocation request. A search of the unbuddied lists is + * performed first. If no suitable free region is found, then a new page is + * allocated and added to the pool to satisfy the request. + * + * Return: 0 if success and handle is set, otherwise -EINVAL if the size or + * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate + * a new page. 
+ */
+static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
+			unsigned long *handle)
+{
+	bool can_sleep = gfpflags_allow_blocking(gfp);
+	struct z3fold_header *zhdr = NULL;
+	struct page *page = NULL;
+	int chunks = size_to_chunks(size);
+	enum buddy bud;
+
+	/* Zero-sized requests and highmem allocations are rejected. */
+	if (!size || (gfp & __GFP_HIGHMEM))
+		return -EINVAL;
+
+	/* Nothing larger than a page can ever be stored. */
+	if (size > PAGE_SIZE)
+		return -ENOSPC;
+
+	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) {
+		/* Does not fit behind an aligned header: use a headless page. */
+		bud = HEADLESS;
+	} else {
+		for (;;) {
+			zhdr = __z3fold_alloc(pool, size, can_sleep);
+			if (!zhdr) {
+				/* No unbuddied page found: a new page is needed. */
+				bud = FIRST;
+				break;
+			}
+
+			bud = get_free_buddy(zhdr, chunks);
+			if (bud != HEADLESS) {
+				page = virt_to_page(zhdr);
+				goto found;
+			}
+
+			/*
+			 * The page returned by __z3fold_alloc() has no free
+			 * buddy after all: drop our reference, complain and
+			 * look again.
+			 */
+			if (!kref_put(&zhdr->refcount,
+				      release_z3fold_page_locked))
+				z3fold_page_unlock(zhdr);
+			pr_err("No free chunks in unbuddied\n");
+			WARN_ON(1);
+		}
+	}
+
+	page = alloc_page(gfp);
+	if (!page)
+		return -ENOMEM;
+
+	zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
+	if (!zhdr) {
+		__free_page(page);
+		return -ENOMEM;
+	}
+	atomic64_inc(&pool->pages_nr);
+
+	if (bud == HEADLESS) {
+		set_bit(PAGE_HEADLESS, &page->private);
+		goto headless;
+	}
+
+	/*
+	 * Mark the page movable around the page lock; when sleeping is not
+	 * allowed the lock is only tried (and a failure is warned about).
+	 */
+	if (can_sleep)
+		lock_page(page);
+	else
+		WARN_ON(!trylock_page(page));
+	__SetPageMovable(page, &z3fold_mops);
+	unlock_page(page);
+	z3fold_page_lock(zhdr);
+
+found:
+	/* Record the size of the new object in the buddy that was chosen. */
+	switch (bud) {
+	case FIRST:
+		zhdr->first_chunks = chunks;
+		break;
+	case LAST:
+		zhdr->last_chunks = chunks;
+		break;
+	default:
+		zhdr->middle_chunks = chunks;
+		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
+		break;
+	}
+	add_to_unbuddied(pool, zhdr);
+
+headless:
+	spin_lock(&pool->lock);
+	/* Unlink the page from the LRU if needed, then put it at the head. */
+	if (!list_empty(&page->lru))
+		list_del(&page->lru);
+	list_add(&page->lru, &pool->lru);
+	*handle = encode_handle(zhdr, bud);
+	spin_unlock(&pool->lock);
+
+	/* Headless pages were never page-locked above. */
+	if (bud != HEADLESS)
+		z3fold_page_unlock(zhdr);
+	return 0;
+}
+
+/**
+ * z3fold_free() - frees the allocation associated with 
the given handle
+ * @pool: pool in which the allocation resided
+ * @handle: handle associated with the allocation returned by z3fold_alloc()
+ *
+ * In the case that the z3fold page in which the allocation resides is under
+ * reclaim, as indicated by the PAGE_CLAIMED flag being set, this function
+ * only sets the first|middle|last_chunks to 0. The page is actually freed
+ * once all buddies are evicted (see z3fold_reclaim_page() below).
+ *
+ * Otherwise a headless page is unlinked from the pool LRU and freed at once.
+ * For a page with a header, the chunk count of the freed buddy is zeroed,
+ * the handle is released with free_handle() and one reference is dropped.
+ * If the page is still referenced afterwards it is flagged NEEDS_COMPACTING
+ * (unless that flag was already set) and compacted, either directly through
+ * do_compact_page() when zhdr->cpu is negative or that CPU is offline, or by
+ * queueing zhdr->work on pool->compact_wq for zhdr->cpu.
+ */
+static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
+{
+	struct z3fold_header *zhdr;
+	struct page *page;
+	enum buddy bud;
+	bool page_claimed;	/* true if PAGE_CLAIMED was already set on entry */
+
+	zhdr = get_z3fold_header(handle);
+	page = virt_to_page(zhdr);
+	page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
+
+	if (test_bit(PAGE_HEADLESS, &page->private)) {
+		/* if a headless page is under reclaim, just leave.
+		 * NB: we use test_and_set_bit for a reason: if the bit
+		 * has not been set before, we release this page
+		 * immediately so we don't care about its value any more.
+		 * Releasing means: unlink from the LRU under pool->lock,
+		 * free the page and decrement pool->pages_nr.
+		 */
+		if (!page_claimed) {
+			spin_lock(&pool->lock);
+			list_del(&page->lru);
+			spin_unlock(&pool->lock);
+			put_z3fold_header(zhdr);
+			free_z3fold_page(page, true);
+			atomic64_dec(&pool->pages_nr);
+		}
+		return;
+	}
+
+	/* Non-headless case: zero the chunk count of the buddy being freed */
+	bud = handle_to_buddy(handle);
+
+	switch (bud) {
+	case FIRST:
+		zhdr->first_chunks = 0;
+		break;
+	case MIDDLE:
+		zhdr->middle_chunks = 0;
+		break;
+	case LAST:
+		zhdr->last_chunks = 0;
+		break;
+	default:
+		pr_err("%s: unknown bud %d\n", __func__, bud);
+		WARN_ON(1);
+		put_z3fold_header(zhdr);
+		return;
+	}
+
+	if (!page_claimed)	/* the handle is only released when we own the claim */
+		free_handle(handle, zhdr);
+	if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list))
+		return;		/* last reference: the release callback took over */
+	if (page_claimed) {
+		/*
+		 * the page has not been claimed by us: PAGE_CLAIMED was
+		 * already set on entry, so the bit is left untouched here
+		 */
+		put_z3fold_header(zhdr);
+		return;
+	}
+	if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
+		clear_bit(PAGE_CLAIMED, &page->private);
+		put_z3fold_header(zhdr);
+		return;
+	}
+	if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
+		zhdr->cpu = -1;
+		kref_get(&zhdr->refcount);	/* do_compact_page() drops one reference */
+		clear_bit(PAGE_CLAIMED, &page->private);
+		do_compact_page(zhdr, true);	/* locked == true: page lock is not re-taken */
+		return;
+	}
+	kref_get(&zhdr->refcount);	/* reference for the queued compaction work */
+	clear_bit(PAGE_CLAIMED, &page->private);
+	queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+	put_z3fold_header(zhdr);
+}
+
+/**
+ * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
+ * @pool: pool from which a page will attempt to be evicted
+ * @retries: number of pages on the LRU list for which eviction will
+ * be attempted before failing
+ *
+ * z3fold reclaim is different from normal system reclaim in that it is done
+ * from the bottom, up. This is because only the bottom layer, z3fold, has
+ * information on how the allocations are organized within each z3fold page.
+ * This has the potential to create interesting locking situations between
+ * z3fold and the user, however. 
+ *
+ * To avoid these, this is how z3fold_reclaim_page() should be called:
+ *
+ * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
+ * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
+ * call the user-defined eviction handler with the pool and handle as
+ * arguments.
+ *
+ * If the handle cannot be evicted, the eviction handler should return
+ * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
+ * appropriate list and try the next z3fold page on the LRU up to
+ * a user defined number of retries.
+ *
+ * If the handle is successfully evicted, the eviction handler should
+ * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
+ * contains logic to delay freeing the page if the page is under reclaim,
+ * as indicated by the PAGE_CLAIMED flag being set in the page's private field.
+ *
+ * If all buddies in the z3fold page are successfully evicted, then the
+ * z3fold page can be freed.
+ *
+ * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
+ * no pages to evict, an eviction handler is not registered or @retries is 0,
+ * -EAGAIN if the retry limit was hit. 
+ */ +static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) +{ + int i, ret = -1; + struct z3fold_header *zhdr = NULL; + struct page *page = NULL; + struct list_head *pos; + unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; + struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN))); + + rwlock_init(&slots.lock); + slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE); + + spin_lock(&pool->lock); + if (!pool->ops || !pool->ops->evict || retries == 0) { + spin_unlock(&pool->lock); + return -EINVAL; + } + for (i = 0; i < retries; i++) { + if (list_empty(&pool->lru)) { + spin_unlock(&pool->lock); + return -EINVAL; + } + list_for_each_prev(pos, &pool->lru) { + page = list_entry(pos, struct page, lru); + + zhdr = page_address(page); + if (test_bit(PAGE_HEADLESS, &page->private)) { + /* + * For non-headless pages, we wait to do this + * until we have the page lock to avoid racing + * with __z3fold_alloc(). Headless pages don't + * have a lock (and __z3fold_alloc() will never + * see them), but we still need to test and set + * PAGE_CLAIMED to avoid racing with + * z3fold_free(), so just do it now before + * leaving the loop. + */ + if (test_and_set_bit(PAGE_CLAIMED, &page->private)) + continue; + + break; + } + + if (!z3fold_page_trylock(zhdr)) { + zhdr = NULL; + continue; /* can't evict at this point */ + } + + /* test_and_set_bit is of course atomic, but we still + * need to do it under page lock, otherwise checking + * that bit in __z3fold_alloc wouldn't make sense + */ + if (zhdr->foreign_handles || + test_and_set_bit(PAGE_CLAIMED, &page->private)) { + z3fold_page_unlock(zhdr); + zhdr = NULL; + continue; /* can't evict such page */ + } + list_del_init(&zhdr->buddy); + zhdr->cpu = -1; + /* See comment in __z3fold_alloc. 
*/ + kref_get(&zhdr->refcount); + break; + } + + if (!zhdr) + break; + + list_del_init(&page->lru); + spin_unlock(&pool->lock); + + if (!test_bit(PAGE_HEADLESS, &page->private)) { + /* + * We need encode the handles before unlocking, and + * use our local slots structure because z3fold_free + * can zero out zhdr->slots and we can't do much + * about that + */ + first_handle = 0; + last_handle = 0; + middle_handle = 0; + memset(slots.slot, 0, sizeof(slots.slot)); + if (zhdr->first_chunks) + first_handle = __encode_handle(zhdr, &slots, + FIRST); + if (zhdr->middle_chunks) + middle_handle = __encode_handle(zhdr, &slots, + MIDDLE); + if (zhdr->last_chunks) + last_handle = __encode_handle(zhdr, &slots, + LAST); + /* + * it's safe to unlock here because we hold a + * reference to this page + */ + z3fold_page_unlock(zhdr); + } else { + first_handle = encode_handle(zhdr, HEADLESS); + last_handle = middle_handle = 0; + } + /* Issue the eviction callback(s) */ + if (middle_handle) { + ret = pool->ops->evict(pool, middle_handle); + if (ret) + goto next; + } + if (first_handle) { + ret = pool->ops->evict(pool, first_handle); + if (ret) + goto next; + } + if (last_handle) { + ret = pool->ops->evict(pool, last_handle); + if (ret) + goto next; + } +next: + if (test_bit(PAGE_HEADLESS, &page->private)) { + if (ret == 0) { + free_z3fold_page(page, true); + atomic64_dec(&pool->pages_nr); + return 0; + } + spin_lock(&pool->lock); + list_add(&page->lru, &pool->lru); + spin_unlock(&pool->lock); + clear_bit(PAGE_CLAIMED, &page->private); + } else { + struct z3fold_buddy_slots *slots = zhdr->slots; + z3fold_page_lock(zhdr); + if (kref_put(&zhdr->refcount, + release_z3fold_page_locked)) { + kmem_cache_free(pool->c_handle, slots); + return 0; + } + /* + * if we are here, the page is still not completely + * free. 
Take the global pool lock then to be able + * to add it back to the lru list + */ + spin_lock(&pool->lock); + list_add(&page->lru, &pool->lru); + spin_unlock(&pool->lock); + if (list_empty(&zhdr->buddy)) + add_to_unbuddied(pool, zhdr); + clear_bit(PAGE_CLAIMED, &page->private); + z3fold_page_unlock(zhdr); + } + + /* We started off locked to we need to lock the pool back */ + spin_lock(&pool->lock); + } + spin_unlock(&pool->lock); + return -EAGAIN; +} + +/** + * z3fold_map() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be mapped + * + * Extracts the buddy number from handle and constructs the pointer to the + * correct starting chunk within the page. + * + * Returns: a pointer to the mapped allocation + */ +static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) +{ + struct z3fold_header *zhdr; + struct page *page; + void *addr; + enum buddy buddy; + + zhdr = get_z3fold_header(handle); + addr = zhdr; + page = virt_to_page(zhdr); + + if (test_bit(PAGE_HEADLESS, &page->private)) + goto out; + + buddy = handle_to_buddy(handle); + switch (buddy) { + case FIRST: + addr += ZHDR_SIZE_ALIGNED; + break; + case MIDDLE: + addr += zhdr->start_middle << CHUNK_SHIFT; + set_bit(MIDDLE_CHUNK_MAPPED, &page->private); + break; + case LAST: + addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT); + break; + default: + pr_err("unknown buddy id %d\n", buddy); + WARN_ON(1); + addr = NULL; + break; + } + + if (addr) + zhdr->mapped_count++; +out: + put_z3fold_header(zhdr); + return addr; +} + +/** + * z3fold_unmap() - unmaps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be unmapped + */ +static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) +{ + struct z3fold_header *zhdr; + struct page *page; + enum buddy buddy; + + zhdr = 
get_z3fold_header(handle); + page = virt_to_page(zhdr); + + if (test_bit(PAGE_HEADLESS, &page->private)) + return; + + buddy = handle_to_buddy(handle); + if (buddy == MIDDLE) + clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + zhdr->mapped_count--; + put_z3fold_header(zhdr); +} + +/** + * z3fold_get_pool_size() - gets the z3fold pool size in pages + * @pool: pool whose size is being queried + * + * Returns: size in pages of the given pool. + */ +static u64 z3fold_get_pool_size(struct z3fold_pool *pool) +{ + return atomic64_read(&pool->pages_nr); +} + +static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) +{ + struct z3fold_header *zhdr; + struct z3fold_pool *pool; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(PageIsolated(page), page); + + if (test_bit(PAGE_HEADLESS, &page->private)) + return false; + + zhdr = page_address(page); + z3fold_page_lock(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private) || + test_bit(PAGE_STALE, &page->private)) + goto out; + + if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) + goto out; + + if (test_and_set_bit(PAGE_CLAIMED, &page->private)) + goto out; + pool = zhdr_to_pool(zhdr); + spin_lock(&pool->lock); + if (!list_empty(&zhdr->buddy)) + list_del_init(&zhdr->buddy); + if (!list_empty(&page->lru)) + list_del_init(&page->lru); + spin_unlock(&pool->lock); + + kref_get(&zhdr->refcount); + z3fold_page_unlock(zhdr); + return true; + +out: + z3fold_page_unlock(zhdr); + return false; +} + +static int z3fold_page_migrate(struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + struct z3fold_header *zhdr, *new_zhdr; + struct z3fold_pool *pool; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + + zhdr = page_address(page); + pool = zhdr_to_pool(zhdr); + + if (!z3fold_page_trylock(zhdr)) + return -EAGAIN; + if 
(zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) { + clear_bit(PAGE_CLAIMED, &page->private); + z3fold_page_unlock(zhdr); + return -EBUSY; + } + if (work_pending(&zhdr->work)) { + z3fold_page_unlock(zhdr); + return -EAGAIN; + } + new_zhdr = page_address(newpage); + memcpy(new_zhdr, zhdr, PAGE_SIZE); + newpage->private = page->private; + set_bit(PAGE_MIGRATED, &page->private); + z3fold_page_unlock(zhdr); + spin_lock_init(&new_zhdr->page_lock); + INIT_WORK(&new_zhdr->work, compact_page_work); + /* + * z3fold_page_isolate() ensures that new_zhdr->buddy is empty, + * so we only have to reinitialize it. + */ + INIT_LIST_HEAD(&new_zhdr->buddy); + __ClearPageMovable(page); + + get_page(newpage); + z3fold_page_lock(new_zhdr); + if (new_zhdr->first_chunks) + encode_handle(new_zhdr, FIRST); + if (new_zhdr->last_chunks) + encode_handle(new_zhdr, LAST); + if (new_zhdr->middle_chunks) + encode_handle(new_zhdr, MIDDLE); + set_bit(NEEDS_COMPACTING, &newpage->private); + new_zhdr->cpu = smp_processor_id(); + spin_lock(&pool->lock); + list_add(&newpage->lru, &pool->lru); + spin_unlock(&pool->lock); + __SetPageMovable(newpage, &z3fold_mops); + z3fold_page_unlock(new_zhdr); + + queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); + + /* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. 
*/ + page->private = 0; + put_page(page); + return 0; +} + +static void z3fold_page_putback(struct page *page) +{ + struct z3fold_header *zhdr; + struct z3fold_pool *pool; + + zhdr = page_address(page); + pool = zhdr_to_pool(zhdr); + + z3fold_page_lock(zhdr); + if (!list_empty(&zhdr->buddy)) + list_del_init(&zhdr->buddy); + INIT_LIST_HEAD(&page->lru); + if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) + return; + spin_lock(&pool->lock); + list_add(&page->lru, &pool->lru); + spin_unlock(&pool->lock); + if (list_empty(&zhdr->buddy)) + add_to_unbuddied(pool, zhdr); + clear_bit(PAGE_CLAIMED, &page->private); + z3fold_page_unlock(zhdr); +} + +static const struct movable_operations z3fold_mops = { + .isolate_page = z3fold_page_isolate, + .migrate_page = z3fold_page_migrate, + .putback_page = z3fold_page_putback, +}; + +/***************** + * zpool + ****************/ + +static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle) +{ + if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) + return pool->zpool_ops->evict(pool->zpool, handle); + else + return -ENOENT; +} + +static const struct z3fold_ops z3fold_zpool_ops = { + .evict = z3fold_zpool_evict +}; + +static void *z3fold_zpool_create(const char *name, gfp_t gfp, + const struct zpool_ops *zpool_ops, + struct zpool *zpool) +{ + struct z3fold_pool *pool; + + pool = z3fold_create_pool(name, gfp, + zpool_ops ? 
&z3fold_zpool_ops : NULL); + if (pool) { + pool->zpool = zpool; + pool->zpool_ops = zpool_ops; + } + return pool; +} + +static void z3fold_zpool_destroy(void *pool) +{ + z3fold_destroy_pool(pool); +} + +static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return z3fold_alloc(pool, size, gfp, handle); +} +static void z3fold_zpool_free(void *pool, unsigned long handle) +{ + z3fold_free(pool, handle); +} + +static int z3fold_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = z3fold_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *z3fold_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return z3fold_map(pool, handle); +} +static void z3fold_zpool_unmap(void *pool, unsigned long handle) +{ + z3fold_unmap(pool, handle); +} + +static u64 z3fold_zpool_total_size(void *pool) +{ + return z3fold_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver z3fold_zpool_driver = { + .type = "z3fold", + .sleep_mapped = true, + .owner = THIS_MODULE, + .create = z3fold_zpool_create, + .destroy = z3fold_zpool_destroy, + .malloc = z3fold_zpool_malloc, + .free = z3fold_zpool_free, + .shrink = z3fold_zpool_shrink, + .map = z3fold_zpool_map, + .unmap = z3fold_zpool_unmap, + .total_size = z3fold_zpool_total_size, +}; + +MODULE_ALIAS("zpool-z3fold"); + +static int __init init_z3fold(void) +{ + /* + * Make sure the z3fold header is not larger than the page size and + * there has remaining spaces for its buddy. 
+ */ + BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE); + zpool_register_driver(&z3fold_zpool_driver); + + return 0; +} + +static void __exit exit_z3fold(void) +{ + zpool_unregister_driver(&z3fold_zpool_driver); +} + +module_init(init_z3fold); +module_exit(exit_z3fold); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Vitaly Wool "); +MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages"); diff --git a/mm/zbud.c b/mm/zbud.c new file mode 100644 index 000000000..634893243 --- /dev/null +++ b/mm/zbud.c @@ -0,0 +1,640 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * zbud.c + * + * Copyright (C) 2013, Seth Jennings, IBM + * + * Concepts based on zcache internal zbud allocator by Dan Magenheimer. + * + * zbud is an special purpose allocator for storing compressed pages. Contrary + * to what its name may suggest, zbud is not a buddy allocator, but rather an + * allocator that "buddies" two compressed pages together in a single memory + * page. + * + * While this design limits storage density, it has simple and deterministic + * reclaim properties that make it preferable to a higher density approach when + * reclaim will be used. + * + * zbud works by storing compressed pages, or "zpages", together in pairs in a + * single memory page called a "zbud page". The first buddy is "left + * justified" at the beginning of the zbud page, and the last buddy is "right + * justified" at the end of the zbud page. The benefit is that if either + * buddy is freed, the freed buddy space, coalesced with whatever slack space + * that existed between the buddies, results in the largest possible free region + * within the zbud page. + * + * zbud also provides an attractive lower bound on density. The ratio of zpages + * to zbud pages can not be less than 1. This ensures that zbud can never "do + * harm" by using more pages to store zpages than the uncompressed zpages would + * have used on their own. + * + * zbud pages are divided into "chunks". 
The size of the chunks is fixed at + * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages + * into chunks allows organizing unbuddied zbud pages into a manageable number + * of unbuddied lists according to the number of free chunks available in the + * zbud page. + * + * The zbud API differs from that of conventional allocators in that the + * allocation function, zbud_alloc(), returns an opaque handle to the user, + * not a dereferenceable pointer. The user must map the handle using + * zbud_map() in order to get a usable pointer by which to access the + * allocation data and unmap the handle with zbud_unmap() when operations + * on the allocation data are complete. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +/***************** + * Structures +*****************/ +/* + * NCHUNKS_ORDER determines the internal allocation granularity, effectively + * adjusting internal fragmentation. It also determines the number of + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the + * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk + * in allocated page is occupied by zbud header, NCHUNKS will be calculated to + * 63 which shows the max number of free chunks in zbud page, also there will be + * 63 freelists per pool. 
+ */ +#define NCHUNKS_ORDER 6 + +#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define ZHDR_SIZE_ALIGNED CHUNK_SIZE +#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) + +struct zbud_pool; + +struct zbud_ops { + int (*evict)(struct zbud_pool *pool, unsigned long handle); +}; + +/** + * struct zbud_pool - stores metadata for each zbud pool + * @lock: protects all pool fields and first|last_chunks fields of any + * zbud page in the pool + * @unbuddied: array of lists tracking zbud pages that only contain one buddy; + * the list each zbud page is added to depends on the size of + * its free region. + * @buddied: list tracking the zbud pages that contain two buddies; + * these zbud pages are full + * @lru: list tracking the zbud pages in LRU order by most recently + * added buddy. + * @pages_nr: number of zbud pages in the pool. + * @ops: pointer to a structure of user defined operations specified at + * pool creation time. + * @zpool: zpool handle passed to the zpool_ops evict callback + * @zpool_ops: zpool operations structure with an evict callback + * + * This structure is allocated at pool creation time and maintains metadata + * pertaining to a particular zbud pool. + */ +struct zbud_pool { + spinlock_t lock; + union { + /* + * Reuse unbuddied[0] as buddied on the ground that + * unbuddied[0] is unused. + */ + struct list_head buddied; + struct list_head unbuddied[NCHUNKS]; + }; + struct list_head lru; + u64 pages_nr; + const struct zbud_ops *ops; + struct zpool *zpool; + const struct zpool_ops *zpool_ops; +}; + +/* + * struct zbud_header - zbud page metadata occupying the first chunk of each + * zbud page.
+ * @buddy: links the zbud page into the unbuddied/buddied lists in the pool + * @lru: links the zbud page into the lru list in the pool + * @first_chunks: the size of the first buddy in chunks, 0 if free + * @last_chunks: the size of the last buddy in chunks, 0 if free + */ +struct zbud_header { + struct list_head buddy; + struct list_head lru; + unsigned int first_chunks; + unsigned int last_chunks; + bool under_reclaim; +}; + +/***************** + * Helpers +*****************/ +/* Just to make the code easier to read */ +enum buddy { + FIRST, + LAST +}; + +/* Converts an allocation size in bytes to size in zbud chunks */ +static int size_to_chunks(size_t size) +{ + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; +} + +#define for_each_unbuddied_list(_iter, _begin) \ + for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) + +/* Initializes the zbud header of a newly allocated zbud page */ +static struct zbud_header *init_zbud_page(struct page *page) +{ + struct zbud_header *zhdr = page_address(page); + zhdr->first_chunks = 0; + zhdr->last_chunks = 0; + INIT_LIST_HEAD(&zhdr->buddy); + INIT_LIST_HEAD(&zhdr->lru); + zhdr->under_reclaim = false; + return zhdr; +} + +/* Resets the struct page fields and frees the page */ +static void free_zbud_page(struct zbud_header *zhdr) +{ + __free_page(virt_to_page(zhdr)); +} + +/* + * Encodes the handle of a particular buddy within a zbud page + * Pool lock should be held as this function accesses first|last_chunks + */ +static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud) +{ + unsigned long handle; + + /* + * For now, the encoded handle is actually just the pointer to the data + * but this might not always be the case. A little information hiding. + * Add CHUNK_SIZE to the handle if it is the first allocation to jump + * over the zbud header in the first chunk. 
+ */ + handle = (unsigned long)zhdr; + if (bud == FIRST) + /* skip over zbud header */ + handle += ZHDR_SIZE_ALIGNED; + else /* bud == LAST */ + handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + return handle; +} + +/* Returns the zbud page where a given handle is stored */ +static struct zbud_header *handle_to_zbud_header(unsigned long handle) +{ + return (struct zbud_header *)(handle & PAGE_MASK); +} + +/* Returns the number of free chunks in a zbud page */ +static int num_free_chunks(struct zbud_header *zhdr) +{ + /* + * Rather than branch for different situations, just use the fact that + * free buddies have a length of zero to simplify everything. + */ + return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; +} + +/***************** + * API Functions +*****************/ +/** + * zbud_create_pool() - create a new zbud pool + * @gfp: gfp flags when allocating the zbud pool structure + * @ops: user-defined operations for the zbud pool + * + * Return: pointer to the new zbud pool or NULL if the metadata allocation + * failed. + */ +static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) +{ + struct zbud_pool *pool; + int i; + + pool = kzalloc(sizeof(struct zbud_pool), gfp); + if (!pool) + return NULL; + spin_lock_init(&pool->lock); + for_each_unbuddied_list(i, 0) + INIT_LIST_HEAD(&pool->unbuddied[i]); + INIT_LIST_HEAD(&pool->buddied); + INIT_LIST_HEAD(&pool->lru); + pool->pages_nr = 0; + pool->ops = ops; + return pool; +} + +/** + * zbud_destroy_pool() - destroys an existing zbud pool + * @pool: the zbud pool to be destroyed + * + * The pool should be emptied before this function is called. 
+ */ +static void zbud_destroy_pool(struct zbud_pool *pool) +{ + kfree(pool); +} + +/** + * zbud_alloc() - allocates a region of a given size + * @pool: zbud pool from which to allocate + * @size: size in bytes of the desired allocation + * @gfp: gfp flags used if the pool needs to grow + * @handle: handle of the new allocation + * + * This function will attempt to find a free region in the pool large enough to + * satisfy the allocation request. A search of the unbuddied lists is + * performed first. If no suitable free region is found, then a new page is + * allocated and added to the pool to satisfy the request. + * + * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used + * as zbud pool pages. + * + * Return: 0 if success and handle is set, otherwise -EINVAL if the size or + * gfp arguments are invalid, -ENOSPC if the size is too large for a zbud + * page, or -ENOMEM if the pool was unable to allocate a new page. + */ +static int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + int chunks, i, freechunks; + struct zbud_header *zhdr = NULL; + enum buddy bud; + struct page *page; + + if (!size || (gfp & __GFP_HIGHMEM)) + return -EINVAL; + if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) + return -ENOSPC; + chunks = size_to_chunks(size); + spin_lock(&pool->lock); + + /* First, try to find an unbuddied zbud page.
*/ + for_each_unbuddied_list(i, chunks) { + if (!list_empty(&pool->unbuddied[i])) { + zhdr = list_first_entry(&pool->unbuddied[i], + struct zbud_header, buddy); + list_del(&zhdr->buddy); + if (zhdr->first_chunks == 0) + bud = FIRST; + else + bud = LAST; + goto found; + } + } + + /* Couldn't find unbuddied zbud page, create new one */ + spin_unlock(&pool->lock); + page = alloc_page(gfp); + if (!page) + return -ENOMEM; + spin_lock(&pool->lock); + pool->pages_nr++; + zhdr = init_zbud_page(page); + bud = FIRST; + +found: + if (bud == FIRST) + zhdr->first_chunks = chunks; + else + zhdr->last_chunks = chunks; + + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* Add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* Add/move zbud page to beginning of LRU */ + if (!list_empty(&zhdr->lru)) + list_del(&zhdr->lru); + list_add(&zhdr->lru, &pool->lru); + + *handle = encode_handle(zhdr, bud); + spin_unlock(&pool->lock); + + return 0; +} + +/** + * zbud_free() - frees the allocation associated with the given handle + * @pool: pool in which the allocation resided + * @handle: handle associated with the allocation returned by zbud_alloc() + * + * In the case that the zbud page in which the allocation resides is under + * reclaim, as indicated by the under_reclaim flag being set, this function + * only sets the first|last_chunks to 0. The page is actually freed + * once both buddies are evicted (see zbud_reclaim_page() below).
+ */ +static void zbud_free(struct zbud_pool *pool, unsigned long handle) +{ + struct zbud_header *zhdr; + int freechunks; + + spin_lock(&pool->lock); + zhdr = handle_to_zbud_header(handle); + + /* If first buddy, (handle - ZHDR_SIZE_ALIGNED) will be page aligned */ + if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK) + zhdr->last_chunks = 0; + else + zhdr->first_chunks = 0; + + if (zhdr->under_reclaim) { + /* zbud page is under reclaim, reclaim will free */ + spin_unlock(&pool->lock); + return; + } + + /* Remove from existing buddy list */ + list_del(&zhdr->buddy); + + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* zbud page is empty, free */ + list_del(&zhdr->lru); + free_zbud_page(zhdr); + pool->pages_nr--; + } else { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } + + spin_unlock(&pool->lock); +} + +/** + * zbud_reclaim_page() - evicts allocations from a pool page and frees it + * @pool: pool from which a page will attempt to be evicted + * @retries: number of pages on the LRU list for which eviction will + * be attempted before failing + * + * zbud reclaim is different from normal system reclaim in that the reclaim is + * done from the bottom, up. This is because only the bottom layer, zbud, has + * information on how the allocations are organized within each zbud page. This + * has the potential to create interesting locking situations between zbud and + * the user, however. + * + * To avoid these, this is how zbud_reclaim_page() should be called: + * + * The user detects a page should be reclaimed and calls zbud_reclaim_page(). + * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call + * the user-defined eviction handler with the pool and handle as arguments. + * + * If the handle can not be evicted, the eviction handler should return + * non-zero.
zbud_reclaim_page() will add the zbud page back to the + * appropriate list and try the next zbud page on the LRU up to + * a user defined number of retries. + * + * If the handle is successfully evicted, the eviction handler should + * return 0 _and_ should have called zbud_free() on the handle. zbud_free() + * contains logic to delay freeing the page if the page is under reclaim, + * as indicated by the under_reclaim flag being set in the zbud page header. + * + * If all buddies in the zbud page are successfully evicted, then the + * zbud page can be freed. + * + * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are + * no pages to evict, no eviction handler is registered or @retries is 0, + * -EAGAIN if the retry limit was hit. + */ +static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) +{ + int i, ret, freechunks; + struct zbud_header *zhdr; + unsigned long first_handle = 0, last_handle = 0; + + spin_lock(&pool->lock); + if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || + retries == 0) { + spin_unlock(&pool->lock); + return -EINVAL; + } + for (i = 0; i < retries; i++) { + zhdr = list_last_entry(&pool->lru, struct zbud_header, lru); + list_del(&zhdr->lru); + list_del(&zhdr->buddy); + /* Protect zbud page against free */ + zhdr->under_reclaim = true; + /* + * We need to encode the handles before unlocking, since we can + * race with free that will set (first|last)_chunks to 0 + */ + first_handle = 0; + last_handle = 0; + if (zhdr->first_chunks) + first_handle = encode_handle(zhdr, FIRST); + if (zhdr->last_chunks) + last_handle = encode_handle(zhdr, LAST); + spin_unlock(&pool->lock); + + /* Issue the eviction callback(s) */ + if (first_handle) { + ret = pool->ops->evict(pool, first_handle); + if (ret) + goto next; + } + if (last_handle) { + ret = pool->ops->evict(pool, last_handle); + if (ret) + goto next; + } +next: + spin_lock(&pool->lock); + zhdr->under_reclaim = false; + if (zhdr->first_chunks == 0 &&
zhdr->last_chunks == 0) { + /* + * Both buddies are now free, free the zbud page and + * return success. + */ + free_zbud_page(zhdr); + pool->pages_nr--; + spin_unlock(&pool->lock); + return 0; + } else if (zhdr->first_chunks == 0 || + zhdr->last_chunks == 0) { + /* add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* add to beginning of LRU */ + list_add(&zhdr->lru, &pool->lru); + } + spin_unlock(&pool->lock); + return -EAGAIN; +} + +/** + * zbud_map() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be mapped + * + * While trivial for zbud, the mapping functions for other allocators + * implementing this allocation API could have more complex information encoded + * in the handle and could create temporary mappings to make the data + * accessible to the user. + * + * Returns: a pointer to the mapped allocation + */ +static void *zbud_map(struct zbud_pool *pool, unsigned long handle) +{ + return (void *)(handle); +} + +/** + * zbud_unmap() - unmaps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be unmapped + */ +static void zbud_unmap(struct zbud_pool *pool, unsigned long handle) +{ +} + +/** + * zbud_get_pool_size() - gets the zbud pool size in pages + * @pool: pool whose size is being queried + * + * Returns: size in pages of the given pool. The pool lock need not be + * taken to access pages_nr.
+ */ +static u64 zbud_get_pool_size(struct zbud_pool *pool) +{ + return pool->pages_nr; +} + +/***************** + * zpool + ****************/ + +static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) +{ + if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) + return pool->zpool_ops->evict(pool->zpool, handle); + else + return -ENOENT; +} + +static const struct zbud_ops zbud_zpool_ops = { + .evict = zbud_zpool_evict +}; + +static void *zbud_zpool_create(const char *name, gfp_t gfp, + const struct zpool_ops *zpool_ops, + struct zpool *zpool) +{ + struct zbud_pool *pool; + + pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); + if (pool) { + pool->zpool = zpool; + pool->zpool_ops = zpool_ops; + } + return pool; +} + +static void zbud_zpool_destroy(void *pool) +{ + zbud_destroy_pool(pool); +} + +static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zbud_alloc(pool, size, gfp, handle); +} +static void zbud_zpool_free(void *pool, unsigned long handle) +{ + zbud_free(pool, handle); +} + +static int zbud_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zbud_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *zbud_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return zbud_map(pool, handle); +} +static void zbud_zpool_unmap(void *pool, unsigned long handle) +{ + zbud_unmap(pool, handle); +} + +static u64 zbud_zpool_total_size(void *pool) +{ + return zbud_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver zbud_zpool_driver = { + .type = "zbud", + .sleep_mapped = true, + .owner = THIS_MODULE, + .create = zbud_zpool_create, + .destroy = zbud_zpool_destroy, + .malloc = zbud_zpool_malloc, + .free = zbud_zpool_free, + .shrink = zbud_zpool_shrink, + .map = 
zbud_zpool_map, + .unmap = zbud_zpool_unmap, + .total_size = zbud_zpool_total_size, +}; + +MODULE_ALIAS("zpool-zbud"); + +static int __init init_zbud(void) +{ + /* Make sure the zbud header will fit in one chunk */ + BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); + pr_info("loaded\n"); + + zpool_register_driver(&zbud_zpool_driver); + + return 0; +} + +static void __exit exit_zbud(void) +{ + zpool_unregister_driver(&zbud_zpool_driver); + pr_info("unloaded\n"); +} + +module_init(init_zbud); +module_exit(exit_zbud); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Seth Jennings "); +MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages"); diff --git a/mm/zpool.c b/mm/zpool.c new file mode 100644 index 000000000..68facc193 --- /dev/null +++ b/mm/zpool.c @@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for memory storage pool implementations. + * Typically, this is used to store compressed memory. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +struct zpool { + struct zpool_driver *driver; + void *pool; + const struct zpool_ops *ops; + bool evictable; + bool can_sleep_mapped; +}; + +static LIST_HEAD(drivers_head); +static DEFINE_SPINLOCK(drivers_lock); + +/** + * zpool_register_driver() - register a zpool implementation. + * @driver: driver to register + */ +void zpool_register_driver(struct zpool_driver *driver) +{ + spin_lock(&drivers_lock); + atomic_set(&driver->refcount, 0); + list_add(&driver->list, &drivers_head); + spin_unlock(&drivers_lock); +} +EXPORT_SYMBOL(zpool_register_driver); + +/** + * zpool_unregister_driver() - unregister a zpool implementation. + * @driver: driver to unregister. 
+ * + * Module usage counting is used to prevent using a driver + * while/after unloading, so if this is called from module + * exit function, this should never fail; if called from + * other than the module exit function, and this returns + * failure, the driver is in use and must remain available. + */ +int zpool_unregister_driver(struct zpool_driver *driver) +{ + int ret = 0, refcount; + + spin_lock(&drivers_lock); + refcount = atomic_read(&driver->refcount); + WARN_ON(refcount < 0); + if (refcount > 0) + ret = -EBUSY; + else + list_del(&driver->list); + spin_unlock(&drivers_lock); + + return ret; +} +EXPORT_SYMBOL(zpool_unregister_driver); + +/* this assumes @type is null-terminated. */ +static struct zpool_driver *zpool_get_driver(const char *type) +{ + struct zpool_driver *driver; + + spin_lock(&drivers_lock); + list_for_each_entry(driver, &drivers_head, list) { + if (!strcmp(driver->type, type)) { + bool got = try_module_get(driver->owner); + + if (got) + atomic_inc(&driver->refcount); + spin_unlock(&drivers_lock); + return got ? driver : NULL; + } + } + + spin_unlock(&drivers_lock); + return NULL; +} + +static void zpool_put_driver(struct zpool_driver *driver) +{ + atomic_dec(&driver->refcount); + module_put(driver->owner); +} + +/** + * zpool_has_pool() - Check if the pool driver is available + * @type: The type of the zpool to check (e.g. zbud, zsmalloc) + * + * This checks if the @type pool driver is available. This will try to load + * the requested module, if needed, but there is no guarantee the module will + * still be loaded and available immediately after calling. If this returns + * true, the caller should assume the pool is available, but must be prepared + * to handle the @zpool_create_pool() returning failure. 
However if this + * returns false, the caller should assume the requested pool type is not + * available; either the requested pool type module does not exist, or could + * not be loaded, and calling @zpool_create_pool() with the pool type will + * fail. + * + * The @type string must be null-terminated. + * + * Returns: true if @type pool is available, false if not + */ +bool zpool_has_pool(char *type) +{ + struct zpool_driver *driver = zpool_get_driver(type); + + if (!driver) { + request_module("zpool-%s", type); + driver = zpool_get_driver(type); + } + + if (!driver) + return false; + + zpool_put_driver(driver); + return true; +} +EXPORT_SYMBOL(zpool_has_pool); + +/** + * zpool_create_pool() - Create a new zpool + * @type: The type of the zpool to create (e.g. zbud, zsmalloc) + * @name: The name of the zpool (e.g. zram0, zswap) + * @gfp: The GFP flags to use when allocating the pool. + * @ops: The optional ops callback. + * + * This creates a new zpool of the specified type. The gfp flags will be + * used when allocating memory, if the implementation supports it. If the + * ops param is NULL, then the created zpool will not be evictable. + * + * Implementations must guarantee this to be thread-safe. + * + * The @type and @name strings must be null-terminated. + * + * Returns: New zpool on success, NULL on failure. 
+ */ +struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, + const struct zpool_ops *ops) +{ + struct zpool_driver *driver; + struct zpool *zpool; + + pr_debug("creating pool type %s\n", type); + + driver = zpool_get_driver(type); + + if (!driver) { + request_module("zpool-%s", type); + driver = zpool_get_driver(type); + } + + if (!driver) { + pr_err("no driver for type %s\n", type); + return NULL; + } + + zpool = kmalloc(sizeof(*zpool), gfp); + if (!zpool) { + pr_err("couldn't create zpool - out of memory\n"); + zpool_put_driver(driver); + return NULL; + } + + zpool->driver = driver; + zpool->pool = driver->create(name, gfp, ops, zpool); + zpool->ops = ops; + zpool->evictable = driver->shrink && ops && ops->evict; + zpool->can_sleep_mapped = driver->sleep_mapped; + + if (!zpool->pool) { + pr_err("couldn't create %s pool\n", type); + zpool_put_driver(driver); + kfree(zpool); + return NULL; + } + + pr_debug("created pool type %s\n", type); + + return zpool; +} + +/** + * zpool_destroy_pool() - Destroy a zpool + * @zpool: The zpool to destroy. + * + * Implementations must guarantee this to be thread-safe, + * however only when destroying different pools. The same + * pool should only be destroyed once, and should not be used + * after it is destroyed. + * + * This destroys an existing zpool. The zpool should not be in use. + */ +void zpool_destroy_pool(struct zpool *zpool) +{ + pr_debug("destroying pool type %s\n", zpool->driver->type); + + zpool->driver->destroy(zpool->pool); + zpool_put_driver(zpool->driver); + kfree(zpool); +} + +/** + * zpool_get_type() - Get the type of the zpool + * @zpool: The zpool to check + * + * This returns the type of the pool. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: The type of zpool. 
+ */ +const char *zpool_get_type(struct zpool *zpool) +{ + return zpool->driver->type; +} + +/** + * zpool_malloc_support_movable() - Check if the zpool supports + * allocating movable memory + * @zpool: The zpool to check + * + * This returns if the zpool supports allocating movable memory. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: true if the zpool supports allocating movable memory, false if not + */ +bool zpool_malloc_support_movable(struct zpool *zpool) +{ + return zpool->driver->malloc_support_movable; +} + +/** + * zpool_malloc() - Allocate memory + * @zpool: The zpool to allocate from. + * @size: The amount of memory to allocate. + * @gfp: The GFP flags to use when allocating memory. + * @handle: Pointer to the handle to set + * + * This allocates the requested amount of memory from the pool. + * The gfp flags will be used when allocating memory, if the + * implementation supports it. The provided @handle will be + * set to the allocated object handle. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error. + */ +int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zpool->driver->malloc(zpool->pool, size, gfp, handle); +} + +/** + * zpool_free() - Free previously allocated memory + * @zpool: The zpool that allocated the memory. + * @handle: The handle to the memory to free. + * + * This frees previously allocated memory. This does not guarantee + * that the pool will actually free memory, only that the memory + * in the pool will become available for use by the pool. + * + * Implementations must guarantee this to be thread-safe, + * however only when freeing different handles. The same + * handle should only be freed once, and should not be used + * after freeing. 
+ */ +void zpool_free(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->free(zpool->pool, handle); +} + +/** + * zpool_shrink() - Shrink the pool size + * @zpool: The zpool to shrink. + * @pages: The number of pages to shrink the pool. + * @reclaimed: The number of pages successfully evicted. + * + * This attempts to shrink the actual memory size of the pool + * by evicting currently used handle(s). If the pool was + * created with no zpool_ops, or the evict call fails for any + * of the handles, this will fail. If non-NULL, the @reclaimed + * parameter will be set to the number of pages reclaimed, + * which may be more than the number of pages requested. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error/failure. + */ +int zpool_shrink(struct zpool *zpool, unsigned int pages, + unsigned int *reclaimed) +{ + return zpool->driver->shrink ? + zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL; +} + +/** + * zpool_map_handle() - Map a previously allocated handle into memory + * @zpool: The zpool that the handle was allocated from + * @handle: The handle to map + * @mapmode: How the memory should be mapped + * + * This maps a previously allocated handle into memory. The @mapmode + * param indicates to the implementation how the memory will be + * used, i.e. read-only, write-only, read-write. If the + * implementation does not support it, the memory will be treated + * as read-write. + * + * This may hold locks, disable interrupts, and/or preemption, + * and the zpool_unmap_handle() must be called to undo those + * actions. The code that uses the mapped handle should complete + * its operations on the mapped handle memory quickly and unmap + * as soon as possible. As the implementation may use per-cpu + * data, multiple handles should not be mapped concurrently on + * any cpu. + * + * Returns: A pointer to the handle's mapped memory area. 
+ */ +void *zpool_map_handle(struct zpool *zpool, unsigned long handle, + enum zpool_mapmode mapmode) +{ + return zpool->driver->map(zpool->pool, handle, mapmode); +} + +/** + * zpool_unmap_handle() - Unmap a previously mapped handle + * @zpool: The zpool that the handle was allocated from + * @handle: The handle to unmap + * + * This unmaps a previously mapped handle. Any locks or other + * actions that the implementation took in zpool_map_handle() + * will be undone here. The memory area returned from + * zpool_map_handle() should no longer be used after this. + */ +void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->unmap(zpool->pool, handle); +} + +/** + * zpool_get_total_size() - The total size of the pool + * @zpool: The zpool to check + * + * This returns the total size in bytes of the pool. + * + * Returns: Total size of the zpool in bytes. + */ +u64 zpool_get_total_size(struct zpool *zpool) +{ + return zpool->driver->total_size(zpool->pool); +} + +/** + * zpool_evictable() - Test if zpool is potentially evictable + * @zpool: The zpool to test + * + * Zpool is only potentially evictable when it's created with struct + * zpool_ops.evict and its driver implements struct zpool_driver.shrink. + * + * However, it doesn't necessarily mean driver will use zpool_ops.evict + * in its implementation of zpool_driver.shrink. It could do internal + * defragmentation instead. + * + * Returns: true if potentially evictable; false otherwise. + */ +bool zpool_evictable(struct zpool *zpool) +{ + return zpool->evictable; +} + +/** + * zpool_can_sleep_mapped() - Test if zpool can sleep while a handle is mapped. + * @zpool: The zpool to test + * + * Returns: true if zpool can sleep; false otherwise.
+ */ +bool zpool_can_sleep_mapped(struct zpool *zpool) +{ + return zpool->can_sleep_mapped; +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Streetman "); +MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c new file mode 100644 index 000000000..37f755c9a --- /dev/null +++ b/mm/zsmalloc.c @@ -0,0 +1,2373 @@ +/* + * zsmalloc memory allocator + * + * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim + * + * This code is released using a dual license strategy: BSD/GPL + * You can choose the license that better fits your requirements. + * + * Released under the terms of 3-clause BSD License + * Released under the terms of GNU General Public License Version 2.0 + */ + +/* + * Following is how we use various fields and flags of underlying + * struct page(s) to form a zspage. + * + * Usage of struct page fields: + * page->private: points to zspage + * page->index: links together all component pages of a zspage + * For the huge page, this is always 0, so we use this field + * to store handle. + * page->page_type: first object offset in a subpage of zspage + * + * Usage of struct page flags: + * PG_private: identifies the first component page + * PG_owner_priv_1: identifies the huge component page + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +/* + * lock ordering: + * page_lock + * pool->lock + * zspage->lock + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ZSPAGE_MAGIC 0x58 + +/* + * This must be power of 2 and greater than or equal to sizeof(link_free). + * These two conditions ensure that any 'struct link_free' itself doesn't + * span more than 1 page which avoids complex case of mapping 2 pages simply + * to restore link_free pointer values. 
+ */ +#define ZS_ALIGN 8 + +/* + * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) + * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N. + */ +#define ZS_MAX_ZSPAGE_ORDER 2 +#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) + +#define ZS_HANDLE_SIZE (sizeof(unsigned long)) + +/* + * Object location (<PFN>, <obj_idx>) is encoded as + * a single (unsigned long) handle value. + * + * Note that object index starts from 0. + * + * This is made more complicated by various memory models and PAE. + */ + +#ifndef MAX_POSSIBLE_PHYSMEM_BITS +#ifdef MAX_PHYSMEM_BITS +#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS +#else +/* + * If this definition of MAX_POSSIBLE_PHYSMEM_BITS is used, OBJ_INDEX_BITS will + * just be PAGE_SHIFT - OBJ_TAG_BITS + */ +#define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG +#endif +#endif + +#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) + +/* + * Head in allocated object should have OBJ_ALLOCATED_TAG + * to identify the object was allocated or not. + * It's okay to add the status bit in the least bit because + * header keeps handle which is 4byte-aligned address so we + * have room for two bits at least. + */ +#define OBJ_ALLOCATED_TAG 1 +#define OBJ_TAG_BITS 1 +#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) +#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) + +#define HUGE_BITS 1 +#define FULLNESS_BITS 2 +#define CLASS_BITS 8 +#define ISOLATED_BITS 3 +#define MAGIC_VAL_BITS 8 + +#define MAX(a, b) ((a) >= (b) ? (a) : (b)) +/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ +#define ZS_MIN_ALLOC_SIZE \ + MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) +/* each chunk includes extra space to keep handle */ +#define ZS_MAX_ALLOC_SIZE PAGE_SIZE + +/* + * On systems with 4K page size, this gives 255 size classes!
There is a + * trade-off here: + * - Large number of size classes is potentially wasteful as free pages are + * spread across these classes + * - Small number of size classes causes large internal fragmentation + * - Probably it's better to use specific size classes (empirically + * determined). NOTE: all those class sizes must be set as multiple of + * ZS_ALIGN to make sure link_free itself never has to span 2 pages. + * + * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN + * (reason above) + */ +#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) +#define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \ + ZS_SIZE_CLASS_DELTA) + 1) + +enum fullness_group { + ZS_EMPTY, + ZS_ALMOST_EMPTY, + ZS_ALMOST_FULL, + ZS_FULL, + NR_ZS_FULLNESS, +}; + +enum class_stat_type { + CLASS_EMPTY, + CLASS_ALMOST_EMPTY, + CLASS_ALMOST_FULL, + CLASS_FULL, + OBJ_ALLOCATED, + OBJ_USED, + NR_ZS_STAT_TYPE, +}; + +struct zs_size_stat { + unsigned long objs[NR_ZS_STAT_TYPE]; +}; + +#ifdef CONFIG_ZSMALLOC_STAT +static struct dentry *zs_stat_root; +#endif + +/* + * We assign a page to ZS_ALMOST_EMPTY fullness group when: + * n <= N / f, where + * n = number of allocated objects + * N = total number of objects zspage can store + * f = fullness_threshold_frac + * + * Similarly, we assign zspage to: + * ZS_ALMOST_FULL when n > N / f + * ZS_EMPTY when n == 0 + * ZS_FULL when n == N + * + * (see: fix_fullness_group()) + */ +static const int fullness_threshold_frac = 4; +static size_t huge_class_size; + +struct size_class { + struct list_head fullness_list[NR_ZS_FULLNESS]; + /* + * Size of objects stored in this class. Must be multiple + * of ZS_ALIGN. + */ + int size; + int objs_per_zspage; + /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ + int pages_per_zspage; + + unsigned int index; + struct zs_size_stat stats; +}; + +/* + * Placed within free objects to form a singly linked list.
+ * For every zspage, zspage->freeobj gives head of this list. + * + * This must be power of 2 and less than or equal to ZS_ALIGN + */ +struct link_free { + union { + /* + * Free object index; + * It's valid for non-allocated object + */ + unsigned long next; + /* + * Handle of allocated object. + */ + unsigned long handle; + }; +}; + +struct zs_pool { + const char *name; + + struct size_class *size_class[ZS_SIZE_CLASSES]; + struct kmem_cache *handle_cachep; + struct kmem_cache *zspage_cachep; + + atomic_long_t pages_allocated; + + struct zs_pool_stats stats; + + /* Compact classes */ + struct shrinker shrinker; + +#ifdef CONFIG_ZSMALLOC_STAT + struct dentry *stat_dentry; +#endif +#ifdef CONFIG_COMPACTION + struct work_struct free_work; +#endif + spinlock_t lock; + atomic_t compaction_in_progress; +}; + +struct zspage { + struct { + unsigned int huge:HUGE_BITS; + unsigned int fullness:FULLNESS_BITS; + unsigned int class:CLASS_BITS + 1; + unsigned int isolated:ISOLATED_BITS; + unsigned int magic:MAGIC_VAL_BITS; + }; + unsigned int inuse; + unsigned int freeobj; + struct page *first_page; + struct list_head list; /* fullness list */ + struct zs_pool *pool; +#ifdef CONFIG_COMPACTION + rwlock_t lock; +#endif +}; + +struct mapping_area { + local_lock_t lock; + char *vm_buf; /* copy buffer for objects that span pages */ + char *vm_addr; /* address of kmap_atomic()'ed pages */ + enum zs_mapmode vm_mm; /* mapping mode */ +}; + +/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ +static void SetZsHugePage(struct zspage *zspage) +{ + zspage->huge = 1; +} + +static bool ZsHugePage(struct zspage *zspage) +{ + return zspage->huge; +} + +#ifdef CONFIG_COMPACTION +static void migrate_lock_init(struct zspage *zspage); +static void migrate_read_lock(struct zspage *zspage); +static void migrate_read_unlock(struct zspage *zspage); +static void migrate_write_lock(struct zspage *zspage); +static void migrate_write_lock_nested(struct zspage *zspage); +static void 
migrate_write_unlock(struct zspage *zspage); +static void kick_deferred_free(struct zs_pool *pool); +static void init_deferred_free(struct zs_pool *pool); +static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); +#else +static void migrate_lock_init(struct zspage *zspage) {} +static void migrate_read_lock(struct zspage *zspage) {} +static void migrate_read_unlock(struct zspage *zspage) {} +static void migrate_write_lock(struct zspage *zspage) {} +static void migrate_write_lock_nested(struct zspage *zspage) {} +static void migrate_write_unlock(struct zspage *zspage) {} +static void kick_deferred_free(struct zs_pool *pool) {} +static void init_deferred_free(struct zs_pool *pool) {} +static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} +#endif + +static int create_cache(struct zs_pool *pool) +{ + pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, + 0, 0, NULL); + if (!pool->handle_cachep) + return 1; + + pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage), + 0, 0, NULL); + if (!pool->zspage_cachep) { + kmem_cache_destroy(pool->handle_cachep); + pool->handle_cachep = NULL; + return 1; + } + + return 0; +} + +static void destroy_cache(struct zs_pool *pool) +{ + kmem_cache_destroy(pool->handle_cachep); + kmem_cache_destroy(pool->zspage_cachep); +} + +static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) +{ + return (unsigned long)kmem_cache_alloc(pool->handle_cachep, + gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); +} + +static void cache_free_handle(struct zs_pool *pool, unsigned long handle) +{ + kmem_cache_free(pool->handle_cachep, (void *)handle); +} + +static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) +{ + return kmem_cache_zalloc(pool->zspage_cachep, + flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); +} + +static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) +{ + kmem_cache_free(pool->zspage_cachep, zspage); +} + +/* pool->lock(which 
owns the handle) synchronizes races */ +static void record_obj(unsigned long handle, unsigned long obj) +{ + *(unsigned long *)handle = obj; +} + +/* zpool driver */ + +#ifdef CONFIG_ZPOOL + +static void *zs_zpool_create(const char *name, gfp_t gfp, + const struct zpool_ops *zpool_ops, + struct zpool *zpool) +{ + /* + * Ignore global gfp flags: zs_malloc() may be invoked from + * different contexts and its caller must provide a valid + * gfp mask. + */ + return zs_create_pool(name); +} + +static void zs_zpool_destroy(void *pool) +{ + zs_destroy_pool(pool); +} + +static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + *handle = zs_malloc(pool, size, gfp); + + if (IS_ERR((void *)(*handle))) + return PTR_ERR((void *)*handle); + return 0; +} +static void zs_zpool_free(void *pool, unsigned long handle) +{ + zs_free(pool, handle); +} + +static void *zs_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + enum zs_mapmode zs_mm; + + switch (mm) { + case ZPOOL_MM_RO: + zs_mm = ZS_MM_RO; + break; + case ZPOOL_MM_WO: + zs_mm = ZS_MM_WO; + break; + case ZPOOL_MM_RW: + default: + zs_mm = ZS_MM_RW; + break; + } + + return zs_map_object(pool, handle, zs_mm); +} +static void zs_zpool_unmap(void *pool, unsigned long handle) +{ + zs_unmap_object(pool, handle); +} + +static u64 zs_zpool_total_size(void *pool) +{ + return zs_get_total_pages(pool) << PAGE_SHIFT; +} + +static struct zpool_driver zs_zpool_driver = { + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc_support_movable = true, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .map = zs_zpool_map, + .unmap = zs_zpool_unmap, + .total_size = zs_zpool_total_size, +}; + +MODULE_ALIAS("zpool-zsmalloc"); +#endif /* CONFIG_ZPOOL */ + +/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ +static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { + .lock = INIT_LOCAL_LOCK(lock), 
+}; + +static __maybe_unused int is_first_page(struct page *page) +{ + return PagePrivate(page); +} + +/* Protected by pool->lock */ +static inline int get_zspage_inuse(struct zspage *zspage) +{ + return zspage->inuse; +} + + +static inline void mod_zspage_inuse(struct zspage *zspage, int val) +{ + zspage->inuse += val; +} + +static inline struct page *get_first_page(struct zspage *zspage) +{ + struct page *first_page = zspage->first_page; + + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); + return first_page; +} + +static inline unsigned int get_first_obj_offset(struct page *page) +{ + return page->page_type; +} + +static inline void set_first_obj_offset(struct page *page, unsigned int offset) +{ + page->page_type = offset; +} + +static inline unsigned int get_freeobj(struct zspage *zspage) +{ + return zspage->freeobj; +} + +static inline void set_freeobj(struct zspage *zspage, unsigned int obj) +{ + zspage->freeobj = obj; +} + +static void get_zspage_mapping(struct zspage *zspage, + unsigned int *class_idx, + enum fullness_group *fullness) +{ + BUG_ON(zspage->magic != ZSPAGE_MAGIC); + + *fullness = zspage->fullness; + *class_idx = zspage->class; +} + +static struct size_class *zspage_class(struct zs_pool *pool, + struct zspage *zspage) +{ + return pool->size_class[zspage->class]; +} + +static void set_zspage_mapping(struct zspage *zspage, + unsigned int class_idx, + enum fullness_group fullness) +{ + zspage->class = class_idx; + zspage->fullness = fullness; +} + +/* + * zsmalloc divides the pool into various size classes where each + * class maintains a list of zspages where each zspage is divided + * into equal sized chunks. Each allocation falls into one of these + * classes depending on its size. This function returns index of the + * size class which has chunk size big enough to hold the given size. 
+ */ +static int get_size_class_index(int size) +{ + int idx = 0; + + if (likely(size > ZS_MIN_ALLOC_SIZE)) + idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, + ZS_SIZE_CLASS_DELTA); + + return min_t(int, ZS_SIZE_CLASSES - 1, idx); +} + +/* type can be of enum type class_stat_type or fullness_group */ +static inline void class_stat_inc(struct size_class *class, + int type, unsigned long cnt) +{ + class->stats.objs[type] += cnt; +} + +/* type can be of enum type class_stat_type or fullness_group */ +static inline void class_stat_dec(struct size_class *class, + int type, unsigned long cnt) +{ + class->stats.objs[type] -= cnt; +} + +/* type can be of enum type class_stat_type or fullness_group */ +static inline unsigned long zs_stat_get(struct size_class *class, + int type) +{ + return class->stats.objs[type]; +} + +#ifdef CONFIG_ZSMALLOC_STAT + +static void __init zs_stat_init(void) +{ + if (!debugfs_initialized()) { + pr_warn("debugfs not available, stat dir not created\n"); + return; + } + + zs_stat_root = debugfs_create_dir("zsmalloc", NULL); +} + +static void __exit zs_stat_exit(void) +{ + debugfs_remove_recursive(zs_stat_root); +} + +static unsigned long zs_can_compact(struct size_class *class); + +static int zs_stats_size_show(struct seq_file *s, void *v) +{ + int i; + struct zs_pool *pool = s->private; + struct size_class *class; + int objs_per_zspage; + unsigned long class_almost_full, class_almost_empty; + unsigned long obj_allocated, obj_used, pages_used, freeable; + unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; + unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; + unsigned long total_freeable = 0; + + seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n", + "class", "size", "almost_full", "almost_empty", + "obj_allocated", "obj_used", "pages_used", + "pages_per_zspage", "freeable"); + + for (i = 0; i < ZS_SIZE_CLASSES; i++) { + class = pool->size_class[i]; + + if (class->index != i) + continue; + + 
spin_lock(&pool->lock); + class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); + class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); + obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + obj_used = zs_stat_get(class, OBJ_USED); + freeable = zs_can_compact(class); + spin_unlock(&pool->lock); + + objs_per_zspage = class->objs_per_zspage; + pages_used = obj_allocated / objs_per_zspage * + class->pages_per_zspage; + + seq_printf(s, " %5u %5u %11lu %12lu %13lu" + " %10lu %10lu %16d %8lu\n", + i, class->size, class_almost_full, class_almost_empty, + obj_allocated, obj_used, pages_used, + class->pages_per_zspage, freeable); + + total_class_almost_full += class_almost_full; + total_class_almost_empty += class_almost_empty; + total_objs += obj_allocated; + total_used_objs += obj_used; + total_pages += pages_used; + total_freeable += freeable; + } + + seq_puts(s, "\n"); + seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n", + "Total", "", total_class_almost_full, + total_class_almost_empty, total_objs, + total_used_objs, total_pages, "", total_freeable); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(zs_stats_size); + +static void zs_pool_stat_create(struct zs_pool *pool, const char *name) +{ + if (!zs_stat_root) { + pr_warn("no root stat dir, not creating <%s> stat dir\n", name); + return; + } + + pool->stat_dentry = debugfs_create_dir(name, zs_stat_root); + + debugfs_create_file("classes", S_IFREG | 0444, pool->stat_dentry, pool, + &zs_stats_size_fops); +} + +static void zs_pool_stat_destroy(struct zs_pool *pool) +{ + debugfs_remove_recursive(pool->stat_dentry); +} + +#else /* CONFIG_ZSMALLOC_STAT */ +static void __init zs_stat_init(void) +{ +} + +static void __exit zs_stat_exit(void) +{ +} + +static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name) +{ +} + +static inline void zs_pool_stat_destroy(struct zs_pool *pool) +{ +} +#endif + + +/* + * For each size class, zspages are divided into different groups + * depending on how
"full" they are. This was done so that we could + * easily find empty or nearly empty zspages when we try to shrink + * the pool (not yet implemented). This function returns fullness + * status of the given page. + */ +static enum fullness_group get_fullness_group(struct size_class *class, + struct zspage *zspage) +{ + int inuse, objs_per_zspage; + enum fullness_group fg; + + inuse = get_zspage_inuse(zspage); + objs_per_zspage = class->objs_per_zspage; + + if (inuse == 0) + fg = ZS_EMPTY; + else if (inuse == objs_per_zspage) + fg = ZS_FULL; + else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac) + fg = ZS_ALMOST_EMPTY; + else + fg = ZS_ALMOST_FULL; + + return fg; +} + +/* + * Each size class maintains various freelists and zspages are assigned + * to one of these freelists based on the number of live objects they + * have. This function inserts the given zspage into the freelist + * identified by (class, fullness_group). + */ +static void insert_zspage(struct size_class *class, + struct zspage *zspage, + enum fullness_group fullness) +{ + struct zspage *head; + + class_stat_inc(class, fullness, 1); + head = list_first_entry_or_null(&class->fullness_list[fullness], + struct zspage, list); + /* + * We want to see more ZS_FULL pages and less almost empty/full. + * Put pages with higher ->inuse first. + */ + if (head && get_zspage_inuse(zspage) < get_zspage_inuse(head)) + list_add(&zspage->list, &head->list); + else + list_add(&zspage->list, &class->fullness_list[fullness]); +} + +/* + * This function removes the given zspage from the freelist identified + * by (class, fullness_group). + */ +static void remove_zspage(struct size_class *class, + struct zspage *zspage, + enum fullness_group fullness) +{ + VM_BUG_ON(list_empty(&class->fullness_list[fullness])); + + list_del_init(&zspage->list); + class_stat_dec(class, fullness, 1); +} + +/* + * Each size class maintains zspages in different fullness groups depending + * on the number of live objects they contain.
When allocating or freeing + * objects, the fullness status of the page can change, say, from ALMOST_FULL + * to ALMOST_EMPTY when freeing an object. This function checks if such + * a status change has occurred for the given page and accordingly moves the + * page from the freelist of the old fullness group to that of the new + * fullness group. + */ +static enum fullness_group fix_fullness_group(struct size_class *class, + struct zspage *zspage) +{ + int class_idx; + enum fullness_group currfg, newfg; + + get_zspage_mapping(zspage, &class_idx, &currfg); + newfg = get_fullness_group(class, zspage); + if (newfg == currfg) + goto out; + + remove_zspage(class, zspage, currfg); + insert_zspage(class, zspage, newfg); + set_zspage_mapping(zspage, class_idx, newfg); +out: + return newfg; +} + +/* + * We have to decide on how many pages to link together + * to form a zspage for each size class. This is important + * to reduce wastage due to unusable space left at end of + * each zspage which is given as: + * wastage = Zp % class_size + * usage = Zp - wastage + * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... + * + * For example, for size class of 3/8 * PAGE_SIZE, we should + * link together 3 PAGE_SIZE sized pages to form a zspage + * since then we can perfectly fit in 8 such objects. 
+ */ +static int get_pages_per_zspage(int class_size) +{ + int i, max_usedpc = 0; + /* zspage order which gives maximum used size per KB */ + int max_usedpc_order = 1; + + for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { + int zspage_size; + int waste, usedpc; + + zspage_size = i * PAGE_SIZE; + waste = zspage_size % class_size; + usedpc = (zspage_size - waste) * 100 / zspage_size; + + if (usedpc > max_usedpc) { + max_usedpc = usedpc; + max_usedpc_order = i; + } + } + + return max_usedpc_order; +} + +static struct zspage *get_zspage(struct page *page) +{ + struct zspage *zspage = (struct zspage *)page_private(page); + + BUG_ON(zspage->magic != ZSPAGE_MAGIC); + return zspage; +} + +static struct page *get_next_page(struct page *page) +{ + struct zspage *zspage = get_zspage(page); + + if (unlikely(ZsHugePage(zspage))) + return NULL; + + return (struct page *)page->index; +} + +/** + * obj_to_location - get (page, obj_idx) from encoded object value + * @obj: the encoded object value + * @page: page object resides in zspage + * @obj_idx: object index + */ +static void obj_to_location(unsigned long obj, struct page **page, + unsigned int *obj_idx) +{ + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *obj_idx = (obj & OBJ_INDEX_MASK); +} + +static void obj_to_page(unsigned long obj, struct page **page) +{ + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); +} + +/** + * location_to_obj - get obj value encoded from (page, obj_idx) + * @page: page object resides in zspage + * @obj_idx: object index + */ +static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) +{ + unsigned long obj; + + obj = page_to_pfn(page) << OBJ_INDEX_BITS; + obj |= obj_idx & OBJ_INDEX_MASK; + obj <<= OBJ_TAG_BITS; + + return obj; +} + +static unsigned long handle_to_obj(unsigned long handle) +{ + return *(unsigned long *)handle; +} + +static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) +{ + unsigned long handle; + struct zspage
*zspage = get_zspage(page); + + if (unlikely(ZsHugePage(zspage))) { + VM_BUG_ON_PAGE(!is_first_page(page), page); + handle = page->index; + } else + handle = *(unsigned long *)obj; + + if (!(handle & OBJ_ALLOCATED_TAG)) + return false; + + *phandle = handle & ~OBJ_ALLOCATED_TAG; + return true; +} + +static void reset_page(struct page *page) +{ + __ClearPageMovable(page); + ClearPagePrivate(page); + set_page_private(page, 0); + page_mapcount_reset(page); + page->index = 0; +} + +static int trylock_zspage(struct zspage *zspage) +{ + struct page *cursor, *fail; + + for (cursor = get_first_page(zspage); cursor != NULL; cursor = + get_next_page(cursor)) { + if (!trylock_page(cursor)) { + fail = cursor; + goto unlock; + } + } + + return 1; +unlock: + for (cursor = get_first_page(zspage); cursor != fail; cursor = + get_next_page(cursor)) + unlock_page(cursor); + + return 0; +} + +static void __free_zspage(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) +{ + struct page *page, *next; + enum fullness_group fg; + unsigned int class_idx; + + get_zspage_mapping(zspage, &class_idx, &fg); + + assert_spin_locked(&pool->lock); + + VM_BUG_ON(get_zspage_inuse(zspage)); + VM_BUG_ON(fg != ZS_EMPTY); + + next = page = get_first_page(zspage); + do { + VM_BUG_ON_PAGE(!PageLocked(page), page); + next = get_next_page(page); + reset_page(page); + unlock_page(page); + dec_zone_page_state(page, NR_ZSPAGES); + put_page(page); + page = next; + } while (page != NULL); + + cache_free_zspage(pool, zspage); + + class_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); +} + +static void free_zspage(struct zs_pool *pool, struct size_class *class, + struct zspage *zspage) +{ + VM_BUG_ON(get_zspage_inuse(zspage)); + VM_BUG_ON(list_empty(&zspage->list)); + + /* + * Since zs_free couldn't be sleepable, this function cannot call + * lock_page. 
The page locks trylock_zspage got will be released + * by __free_zspage. + */ + if (!trylock_zspage(zspage)) { + kick_deferred_free(pool); + return; + } + + remove_zspage(class, zspage, ZS_EMPTY); + __free_zspage(pool, class, zspage); +} + +/* Initialize a newly allocated zspage */ +static void init_zspage(struct size_class *class, struct zspage *zspage) +{ + unsigned int freeobj = 1; + unsigned long off = 0; + struct page *page = get_first_page(zspage); + + while (page) { + struct page *next_page; + struct link_free *link; + void *vaddr; + + set_first_obj_offset(page, off); + + vaddr = kmap_atomic(page); + link = (struct link_free *)vaddr + off / sizeof(*link); + + while ((off += class->size) < PAGE_SIZE) { + link->next = freeobj++ << OBJ_TAG_BITS; + link += class->size / sizeof(*link); + } + + /* + * We now come to the last (full or partial) object on this + * page, which must point to the first object on the next + * page (if present) + */ + next_page = get_next_page(page); + if (next_page) { + link->next = freeobj++ << OBJ_TAG_BITS; + } else { + /* + * Reset OBJ_TAG_BITS bit to last link to tell + * whether it's allocated object or not. + */ + link->next = -1UL << OBJ_TAG_BITS; + } + kunmap_atomic(vaddr); + page = next_page; + off %= PAGE_SIZE; + } + + set_freeobj(zspage, 0); +} + +static void create_page_chain(struct size_class *class, struct zspage *zspage, + struct page *pages[]) +{ + int i; + struct page *page; + struct page *prev_page = NULL; + int nr_pages = class->pages_per_zspage; + + /* + * Allocate individual pages and link them together as: + * 1. all pages are linked together using page->index + * 2. each sub-page point to zspage using page->private + * + * we set PG_private to identify the first page (i.e. no other sub-page + * has this flag set). 
+ */ + for (i = 0; i < nr_pages; i++) { + page = pages[i]; + set_page_private(page, (unsigned long)zspage); + page->index = 0; + if (i == 0) { + zspage->first_page = page; + SetPagePrivate(page); + if (unlikely(class->objs_per_zspage == 1 && + class->pages_per_zspage == 1)) + SetZsHugePage(zspage); + } else { + prev_page->index = (unsigned long)page; + } + prev_page = page; + } +} + +/* + * Allocate a zspage for the given size class + */ +static struct zspage *alloc_zspage(struct zs_pool *pool, + struct size_class *class, + gfp_t gfp) +{ + int i; + struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE]; + struct zspage *zspage = cache_alloc_zspage(pool, gfp); + + if (!zspage) + return NULL; + + zspage->magic = ZSPAGE_MAGIC; + migrate_lock_init(zspage); + + for (i = 0; i < class->pages_per_zspage; i++) { + struct page *page; + + page = alloc_page(gfp); + if (!page) { + while (--i >= 0) { + dec_zone_page_state(pages[i], NR_ZSPAGES); + __free_page(pages[i]); + } + cache_free_zspage(pool, zspage); + return NULL; + } + + inc_zone_page_state(page, NR_ZSPAGES); + pages[i] = page; + } + + create_page_chain(class, zspage, pages); + init_zspage(class, zspage); + zspage->pool = pool; + + return zspage; +} + +static struct zspage *find_get_zspage(struct size_class *class) +{ + int i; + struct zspage *zspage; + + for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) { + zspage = list_first_entry_or_null(&class->fullness_list[i], + struct zspage, list); + if (zspage) + break; + } + + return zspage; +} + +static inline int __zs_cpu_up(struct mapping_area *area) +{ + /* + * Make sure we don't leak memory if a cpu UP notification + * and zs_init() race and both call zs_cpu_up() on the same cpu + */ + if (area->vm_buf) + return 0; + area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); + if (!area->vm_buf) + return -ENOMEM; + return 0; +} + +static inline void __zs_cpu_down(struct mapping_area *area) +{ + kfree(area->vm_buf); + area->vm_buf = NULL; +} + +static void *__zs_map_object(struct 
mapping_area *area, + struct page *pages[2], int off, int size) +{ + int sizes[2]; + void *addr; + char *buf = area->vm_buf; + + /* disable page faults to match kmap_atomic() return conditions */ + pagefault_disable(); + + /* no read fastpath */ + if (area->vm_mm == ZS_MM_WO) + goto out; + + sizes[0] = PAGE_SIZE - off; + sizes[1] = size - sizes[0]; + + /* copy object to per-cpu buffer */ + addr = kmap_atomic(pages[0]); + memcpy(buf, addr + off, sizes[0]); + kunmap_atomic(addr); + addr = kmap_atomic(pages[1]); + memcpy(buf + sizes[0], addr, sizes[1]); + kunmap_atomic(addr); +out: + return area->vm_buf; +} + +static void __zs_unmap_object(struct mapping_area *area, + struct page *pages[2], int off, int size) +{ + int sizes[2]; + void *addr; + char *buf; + + /* no write fastpath */ + if (area->vm_mm == ZS_MM_RO) + goto out; + + buf = area->vm_buf; + buf = buf + ZS_HANDLE_SIZE; + size -= ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; + + sizes[0] = PAGE_SIZE - off; + sizes[1] = size - sizes[0]; + + /* copy per-cpu buffer to object */ + addr = kmap_atomic(pages[0]); + memcpy(addr + off, buf, sizes[0]); + kunmap_atomic(addr); + addr = kmap_atomic(pages[1]); + memcpy(addr, buf + sizes[0], sizes[1]); + kunmap_atomic(addr); + +out: + /* enable page faults to match kunmap_atomic() return conditions */ + pagefault_enable(); +} + +static int zs_cpu_prepare(unsigned int cpu) +{ + struct mapping_area *area; + + area = &per_cpu(zs_map_area, cpu); + return __zs_cpu_up(area); +} + +static int zs_cpu_dead(unsigned int cpu) +{ + struct mapping_area *area; + + area = &per_cpu(zs_map_area, cpu); + __zs_cpu_down(area); + return 0; +} + +static bool can_merge(struct size_class *prev, int pages_per_zspage, + int objs_per_zspage) +{ + if (prev->pages_per_zspage == pages_per_zspage && + prev->objs_per_zspage == objs_per_zspage) + return true; + + return false; +} + +static bool zspage_full(struct size_class *class, struct zspage *zspage) +{ + return get_zspage_inuse(zspage) == 
class->objs_per_zspage; +} + +unsigned long zs_get_total_pages(struct zs_pool *pool) +{ + return atomic_long_read(&pool->pages_allocated); +} +EXPORT_SYMBOL_GPL(zs_get_total_pages); + +/** + * zs_map_object - get address of allocated object from handle. + * @pool: pool from which the object was allocated + * @handle: handle returned from zs_malloc + * @mm: mapping mode to use + * + * Before using an object allocated from zs_malloc, it must be mapped using + * this function. When done with the object, it must be unmapped using + * zs_unmap_object. + * + * Only one object can be mapped per cpu at a time. There is no protection + * against nested mappings. + * + * This function returns with preemption and page faults disabled. + */ +void *zs_map_object(struct zs_pool *pool, unsigned long handle, + enum zs_mapmode mm) +{ + struct zspage *zspage; + struct page *page; + unsigned long obj, off; + unsigned int obj_idx; + + struct size_class *class; + struct mapping_area *area; + struct page *pages[2]; + void *ret; + + /* + * Because we use per-cpu mapping areas shared among the + * pools/users, we can't allow mapping in interrupt context + * because it can corrupt another users mappings. + */ + BUG_ON(in_interrupt()); + + /* It guarantees it can get zspage from handle safely */ + spin_lock(&pool->lock); + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); + zspage = get_zspage(page); + + /* + * migration cannot move any zpages in this zspage. Here, pool->lock + * is too heavy since callers would take some time until they calls + * zs_unmap_object API so delegate the locking from class to zspage + * which is smaller granularity. 
+ */ + migrate_read_lock(zspage); + spin_unlock(&pool->lock); + + class = zspage_class(pool, zspage); + off = (class->size * obj_idx) & ~PAGE_MASK; + + local_lock(&zs_map_area.lock); + area = this_cpu_ptr(&zs_map_area); + area->vm_mm = mm; + if (off + class->size <= PAGE_SIZE) { + /* this object is contained entirely within a page */ + area->vm_addr = kmap_atomic(page); + ret = area->vm_addr + off; + goto out; + } + + /* this object spans two pages */ + pages[0] = page; + pages[1] = get_next_page(page); + BUG_ON(!pages[1]); + + ret = __zs_map_object(area, pages, off, class->size); +out: + if (likely(!ZsHugePage(zspage))) + ret += ZS_HANDLE_SIZE; + + return ret; +} +EXPORT_SYMBOL_GPL(zs_map_object); + +void zs_unmap_object(struct zs_pool *pool, unsigned long handle) +{ + struct zspage *zspage; + struct page *page; + unsigned long obj, off; + unsigned int obj_idx; + + struct size_class *class; + struct mapping_area *area; + + obj = handle_to_obj(handle); + obj_to_location(obj, &page, &obj_idx); + zspage = get_zspage(page); + class = zspage_class(pool, zspage); + off = (class->size * obj_idx) & ~PAGE_MASK; + + area = this_cpu_ptr(&zs_map_area); + if (off + class->size <= PAGE_SIZE) + kunmap_atomic(area->vm_addr); + else { + struct page *pages[2]; + + pages[0] = page; + pages[1] = get_next_page(page); + BUG_ON(!pages[1]); + + __zs_unmap_object(area, pages, off, class->size); + } + local_unlock(&zs_map_area.lock); + + migrate_read_unlock(zspage); +} +EXPORT_SYMBOL_GPL(zs_unmap_object); + +/** + * zs_huge_class_size() - Returns the size (in bytes) of the first huge + * zsmalloc &size_class. + * @pool: zsmalloc pool to use + * + * The function returns the size of the first huge class - any object of equal + * or bigger size will be stored in zspage consisting of a single physical + * page. + * + * Context: Any context. + * + * Return: the size (in bytes) of the first huge zsmalloc &size_class. 
+ */ +size_t zs_huge_class_size(struct zs_pool *pool) +{ + return huge_class_size; +} +EXPORT_SYMBOL_GPL(zs_huge_class_size); + +static unsigned long obj_malloc(struct zs_pool *pool, + struct zspage *zspage, unsigned long handle) +{ + int i, nr_page, offset; + unsigned long obj; + struct link_free *link; + struct size_class *class; + + struct page *m_page; + unsigned long m_offset; + void *vaddr; + + class = pool->size_class[zspage->class]; + handle |= OBJ_ALLOCATED_TAG; + obj = get_freeobj(zspage); + + offset = obj * class->size; + nr_page = offset >> PAGE_SHIFT; + m_offset = offset & ~PAGE_MASK; + m_page = get_first_page(zspage); + + for (i = 0; i < nr_page; i++) + m_page = get_next_page(m_page); + + vaddr = kmap_atomic(m_page); + link = (struct link_free *)vaddr + m_offset / sizeof(*link); + set_freeobj(zspage, link->next >> OBJ_TAG_BITS); + if (likely(!ZsHugePage(zspage))) + /* record handle in the header of allocated chunk */ + link->handle = handle; + else + /* record handle to page->index */ + zspage->first_page->index = handle; + + kunmap_atomic(vaddr); + mod_zspage_inuse(zspage, 1); + + obj = location_to_obj(m_page, obj); + + return obj; +} + + +/** + * zs_malloc - Allocate block of given size from pool. + * @pool: pool to allocate from + * @size: size of block to allocate + * @gfp: gfp flags when allocating object + * + * On success, handle to the allocated object is returned, + * otherwise an ERR_PTR(). + * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. 
+ */ +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) +{ + unsigned long handle, obj; + struct size_class *class; + enum fullness_group newfg; + struct zspage *zspage; + + if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) + return (unsigned long)ERR_PTR(-EINVAL); + + handle = cache_alloc_handle(pool, gfp); + if (!handle) + return (unsigned long)ERR_PTR(-ENOMEM); + + /* extra space in chunk to keep the handle */ + size += ZS_HANDLE_SIZE; + class = pool->size_class[get_size_class_index(size)]; + + /* pool->lock effectively protects the zpage migration */ + spin_lock(&pool->lock); + zspage = find_get_zspage(class); + if (likely(zspage)) { + obj = obj_malloc(pool, zspage, handle); + /* Now move the zspage to another fullness group, if required */ + fix_fullness_group(class, zspage); + record_obj(handle, obj); + class_stat_inc(class, OBJ_USED, 1); + spin_unlock(&pool->lock); + + return handle; + } + + spin_unlock(&pool->lock); + + zspage = alloc_zspage(pool, class, gfp); + if (!zspage) { + cache_free_handle(pool, handle); + return (unsigned long)ERR_PTR(-ENOMEM); + } + + spin_lock(&pool->lock); + obj = obj_malloc(pool, zspage, handle); + newfg = get_fullness_group(class, zspage); + insert_zspage(class, zspage, newfg); + set_zspage_mapping(zspage, class->index, newfg); + record_obj(handle, obj); + atomic_long_add(class->pages_per_zspage, + &pool->pages_allocated); + class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); + class_stat_inc(class, OBJ_USED, 1); + + /* We completely set up zspage so mark them as movable */ + SetZsPageMovable(pool, zspage); + spin_unlock(&pool->lock); + + return handle; +} +EXPORT_SYMBOL_GPL(zs_malloc); + +static void obj_free(int class_size, unsigned long obj) +{ + struct link_free *link; + struct zspage *zspage; + struct page *f_page; + unsigned long f_offset; + unsigned int f_objidx; + void *vaddr; + + obj_to_location(obj, &f_page, &f_objidx); + f_offset = (class_size * f_objidx) & ~PAGE_MASK; + zspage = 
get_zspage(f_page); + + vaddr = kmap_atomic(f_page); + + /* Insert this object in containing zspage's freelist */ + link = (struct link_free *)(vaddr + f_offset); + if (likely(!ZsHugePage(zspage))) + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + else + f_page->index = 0; + kunmap_atomic(vaddr); + set_freeobj(zspage, f_objidx); + mod_zspage_inuse(zspage, -1); +} + +void zs_free(struct zs_pool *pool, unsigned long handle) +{ + struct zspage *zspage; + struct page *f_page; + unsigned long obj; + struct size_class *class; + enum fullness_group fullness; + + if (IS_ERR_OR_NULL((void *)handle)) + return; + + /* + * The pool->lock protects the race with zpage's migration + * so it's safe to get the page from handle. + */ + spin_lock(&pool->lock); + obj = handle_to_obj(handle); + obj_to_page(obj, &f_page); + zspage = get_zspage(f_page); + class = zspage_class(pool, zspage); + + obj_free(class->size, obj); + class_stat_dec(class, OBJ_USED, 1); + fullness = fix_fullness_group(class, zspage); + if (fullness != ZS_EMPTY) + goto out; + + free_zspage(pool, class, zspage); +out: + spin_unlock(&pool->lock); + cache_free_handle(pool, handle); +} +EXPORT_SYMBOL_GPL(zs_free); + +static void zs_object_copy(struct size_class *class, unsigned long dst, + unsigned long src) +{ + struct page *s_page, *d_page; + unsigned int s_objidx, d_objidx; + unsigned long s_off, d_off; + void *s_addr, *d_addr; + int s_size, d_size, size; + int written = 0; + + s_size = d_size = class->size; + + obj_to_location(src, &s_page, &s_objidx); + obj_to_location(dst, &d_page, &d_objidx); + + s_off = (class->size * s_objidx) & ~PAGE_MASK; + d_off = (class->size * d_objidx) & ~PAGE_MASK; + + if (s_off + class->size > PAGE_SIZE) + s_size = PAGE_SIZE - s_off; + + if (d_off + class->size > PAGE_SIZE) + d_size = PAGE_SIZE - d_off; + + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + + while (1) { + size = min(s_size, d_size); + memcpy(d_addr + d_off, s_addr + s_off, size); + written += size; + + 
if (written == class->size) + break; + + s_off += size; + s_size -= size; + d_off += size; + d_size -= size; + + /* + * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic() + * calls must occur in reverse order of calls to kmap_atomic(). + * So, to call kunmap_atomic(s_addr) we should first call + * kunmap_atomic(d_addr). For more details see + * Documentation/mm/highmem.rst. + */ + if (s_off >= PAGE_SIZE) { + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); + s_page = get_next_page(s_page); + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + s_size = class->size - written; + s_off = 0; + } + + if (d_off >= PAGE_SIZE) { + kunmap_atomic(d_addr); + d_page = get_next_page(d_page); + d_addr = kmap_atomic(d_page); + d_size = class->size - written; + d_off = 0; + } + } + + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); +} + +/* + * Find alloced object in zspage from index object and + * return handle. + */ +static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int *obj_idx) +{ + unsigned int offset; + int index = *obj_idx; + unsigned long handle = 0; + void *addr = kmap_atomic(page); + + offset = get_first_obj_offset(page); + offset += class->size * index; + + while (offset < PAGE_SIZE) { + if (obj_allocated(page, addr + offset, &handle)) + break; + + offset += class->size; + index++; + } + + kunmap_atomic(addr); + + *obj_idx = index; + + return handle; +} + +struct zs_compact_control { + /* Source spage for migration which could be a subpage of zspage */ + struct page *s_page; + /* Destination page for migration which should be a first page + * of zspage. */ + struct page *d_page; + /* Starting object index within @s_page which used for live object + * in the subpage. 
*/ + int obj_idx; +}; + +static int migrate_zspage(struct zs_pool *pool, struct size_class *class, + struct zs_compact_control *cc) +{ + unsigned long used_obj, free_obj; + unsigned long handle; + struct page *s_page = cc->s_page; + struct page *d_page = cc->d_page; + int obj_idx = cc->obj_idx; + int ret = 0; + + while (1) { + handle = find_alloced_obj(class, s_page, &obj_idx); + if (!handle) { + s_page = get_next_page(s_page); + if (!s_page) + break; + obj_idx = 0; + continue; + } + + /* Stop if there is no more space */ + if (zspage_full(class, get_zspage(d_page))) { + ret = -ENOMEM; + break; + } + + used_obj = handle_to_obj(handle); + free_obj = obj_malloc(pool, get_zspage(d_page), handle); + zs_object_copy(class, free_obj, used_obj); + obj_idx++; + record_obj(handle, free_obj); + obj_free(class->size, used_obj); + } + + /* Remember last position in this iteration */ + cc->s_page = s_page; + cc->obj_idx = obj_idx; + + return ret; +} + +static struct zspage *isolate_zspage(struct size_class *class, bool source) +{ + int i; + struct zspage *zspage; + enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL}; + + if (!source) { + fg[0] = ZS_ALMOST_FULL; + fg[1] = ZS_ALMOST_EMPTY; + } + + for (i = 0; i < 2; i++) { + zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], + struct zspage, list); + if (zspage) { + remove_zspage(class, zspage, fg[i]); + return zspage; + } + } + + return zspage; +} + +/* + * putback_zspage - add @zspage into right class's fullness list + * @class: destination class + * @zspage: target page + * + * Return @zspage's fullness_group + */ +static enum fullness_group putback_zspage(struct size_class *class, + struct zspage *zspage) +{ + enum fullness_group fullness; + + fullness = get_fullness_group(class, zspage); + insert_zspage(class, zspage, fullness); + set_zspage_mapping(zspage, class->index, fullness); + + return fullness; +} + +#ifdef CONFIG_COMPACTION +/* + * To prevent zspage destroy during migration, zspage freeing 
should + * hold locks of all pages in the zspage. + */ +static void lock_zspage(struct zspage *zspage) +{ + struct page *curr_page, *page; + + /* + * Pages we haven't locked yet can be migrated off the list while we're + * trying to lock them, so we need to be careful and only attempt to + * lock each page under migrate_read_lock(). Otherwise, the page we lock + * may no longer belong to the zspage. This means that we may wait for + * the wrong page to unlock, so we must take a reference to the page + * prior to waiting for it to unlock outside migrate_read_lock(). + */ + while (1) { + migrate_read_lock(zspage); + page = get_first_page(zspage); + if (trylock_page(page)) + break; + get_page(page); + migrate_read_unlock(zspage); + wait_on_page_locked(page); + put_page(page); + } + + curr_page = page; + while ((page = get_next_page(curr_page))) { + if (trylock_page(page)) { + curr_page = page; + } else { + get_page(page); + migrate_read_unlock(zspage); + wait_on_page_locked(page); + put_page(page); + migrate_read_lock(zspage); + } + } + migrate_read_unlock(zspage); +} + +static void migrate_lock_init(struct zspage *zspage) +{ + rwlock_init(&zspage->lock); +} + +static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock) +{ + read_lock(&zspage->lock); +} + +static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) +{ + read_unlock(&zspage->lock); +} + +static void migrate_write_lock(struct zspage *zspage) +{ + write_lock(&zspage->lock); +} + +static void migrate_write_lock_nested(struct zspage *zspage) +{ + write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING); +} + +static void migrate_write_unlock(struct zspage *zspage) +{ + write_unlock(&zspage->lock); +} + +/* Number of isolated subpage for *page migration* in this zspage */ +static void inc_zspage_isolation(struct zspage *zspage) +{ + zspage->isolated++; +} + +static void dec_zspage_isolation(struct zspage *zspage) +{ + VM_BUG_ON(zspage->isolated == 0); + 
zspage->isolated--; +} + +static const struct movable_operations zsmalloc_mops; + +static void replace_sub_page(struct size_class *class, struct zspage *zspage, + struct page *newpage, struct page *oldpage) +{ + struct page *page; + struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; + int idx = 0; + + page = get_first_page(zspage); + do { + if (page == oldpage) + pages[idx] = newpage; + else + pages[idx] = page; + idx++; + } while ((page = get_next_page(page)) != NULL); + + create_page_chain(class, zspage, pages); + set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); + if (unlikely(ZsHugePage(zspage))) + newpage->index = oldpage->index; + __SetPageMovable(newpage, &zsmalloc_mops); +} + +static bool zs_page_isolate(struct page *page, isolate_mode_t mode) +{ + struct zs_pool *pool; + struct zspage *zspage; + + /* + * Page is locked so zspage couldn't be destroyed. For detail, look at + * lock_zspage in free_zspage. + */ + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(PageIsolated(page), page); + + zspage = get_zspage(page); + pool = zspage->pool; + spin_lock(&pool->lock); + inc_zspage_isolation(zspage); + spin_unlock(&pool->lock); + + return true; +} + +static int zs_page_migrate(struct page *newpage, struct page *page, + enum migrate_mode mode) +{ + struct zs_pool *pool; + struct size_class *class; + struct zspage *zspage; + struct page *dummy; + void *s_addr, *d_addr, *addr; + unsigned int offset; + unsigned long handle; + unsigned long old_obj, new_obj; + unsigned int obj_idx; + + /* + * We cannot support the _NO_COPY case here, because copy needs to + * happen under the zs lock, which does not work with + * MIGRATE_SYNC_NO_COPY workflow. 
+ */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + /* The page is locked, so this pointer must remain valid */ + zspage = get_zspage(page); + pool = zspage->pool; + + /* + * The pool's lock protects the race between zpage migration + * and zs_free. + */ + spin_lock(&pool->lock); + class = zspage_class(pool, zspage); + + /* the migrate_write_lock protects zpage access via zs_map_object */ + migrate_write_lock(zspage); + + offset = get_first_obj_offset(page); + s_addr = kmap_atomic(page); + + /* + * Here, no user can access any object in the zspage, so let's move. + */ + d_addr = kmap_atomic(newpage); + memcpy(d_addr, s_addr, PAGE_SIZE); + kunmap_atomic(d_addr); + + for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; + addr += class->size) { + if (obj_allocated(page, addr, &handle)) { + + old_obj = handle_to_obj(handle); + obj_to_location(old_obj, &dummy, &obj_idx); + new_obj = (unsigned long)location_to_obj(newpage, + obj_idx); + record_obj(handle, new_obj); + } + } + kunmap_atomic(s_addr); + + replace_sub_page(class, zspage, newpage, page); + dec_zspage_isolation(zspage); + /* + * Since we complete the data copy and set up new zspage structure, + * it's okay to release the pool's lock. 
+ */ + spin_unlock(&pool->lock); + migrate_write_unlock(zspage); + + get_page(newpage); + if (page_zone(newpage) != page_zone(page)) { + dec_zone_page_state(page, NR_ZSPAGES); + inc_zone_page_state(newpage, NR_ZSPAGES); + } + + reset_page(page); + put_page(page); + + return MIGRATEPAGE_SUCCESS; +} + +static void zs_page_putback(struct page *page) +{ + struct zs_pool *pool; + struct zspage *zspage; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zspage = get_zspage(page); + pool = zspage->pool; + spin_lock(&pool->lock); + dec_zspage_isolation(zspage); + spin_unlock(&pool->lock); +} + +static const struct movable_operations zsmalloc_mops = { + .isolate_page = zs_page_isolate, + .migrate_page = zs_page_migrate, + .putback_page = zs_page_putback, +}; + +/* + * Caller should hold page_lock of all pages in the zspage + * In here, we cannot use zspage meta data. + */ +static void async_free_zspage(struct work_struct *work) +{ + int i; + struct size_class *class; + unsigned int class_idx; + enum fullness_group fullness; + struct zspage *zspage, *tmp; + LIST_HEAD(free_pages); + struct zs_pool *pool = container_of(work, struct zs_pool, + free_work); + + for (i = 0; i < ZS_SIZE_CLASSES; i++) { + class = pool->size_class[i]; + if (class->index != i) + continue; + + spin_lock(&pool->lock); + list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages); + spin_unlock(&pool->lock); + } + + list_for_each_entry_safe(zspage, tmp, &free_pages, list) { + list_del(&zspage->list); + lock_zspage(zspage); + + get_zspage_mapping(zspage, &class_idx, &fullness); + VM_BUG_ON(fullness != ZS_EMPTY); + class = pool->size_class[class_idx]; + spin_lock(&pool->lock); + __free_zspage(pool, class, zspage); + spin_unlock(&pool->lock); + } +}; + +static void kick_deferred_free(struct zs_pool *pool) +{ + schedule_work(&pool->free_work); +} + +static void zs_flush_migration(struct zs_pool *pool) +{ + flush_work(&pool->free_work); +} + +static void 
init_deferred_free(struct zs_pool *pool) +{ + INIT_WORK(&pool->free_work, async_free_zspage); +} + +static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) +{ + struct page *page = get_first_page(zspage); + + do { + WARN_ON(!trylock_page(page)); + __SetPageMovable(page, &zsmalloc_mops); + unlock_page(page); + } while ((page = get_next_page(page)) != NULL); +} +#else +static inline void zs_flush_migration(struct zs_pool *pool) { } +#endif + +/* + * + * Based on the number of unused allocated objects calculate + * and return the number of pages that we can free. + */ +static unsigned long zs_can_compact(struct size_class *class) +{ + unsigned long obj_wasted; + unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + unsigned long obj_used = zs_stat_get(class, OBJ_USED); + + if (obj_allocated <= obj_used) + return 0; + + obj_wasted = obj_allocated - obj_used; + obj_wasted /= class->objs_per_zspage; + + return obj_wasted * class->pages_per_zspage; +} + +static unsigned long __zs_compact(struct zs_pool *pool, + struct size_class *class) +{ + struct zs_compact_control cc; + struct zspage *src_zspage; + struct zspage *dst_zspage = NULL; + unsigned long pages_freed = 0; + + /* + * protect the race between zpage migration and zs_free + * as well as zpage allocation/free + */ + spin_lock(&pool->lock); + while ((src_zspage = isolate_zspage(class, true))) { + /* protect someone accessing the zspage(i.e., zs_map_object) */ + migrate_write_lock(src_zspage); + + if (!zs_can_compact(class)) + break; + + cc.obj_idx = 0; + cc.s_page = get_first_page(src_zspage); + + while ((dst_zspage = isolate_zspage(class, false))) { + migrate_write_lock_nested(dst_zspage); + + cc.d_page = get_first_page(dst_zspage); + /* + * If there is no more space in dst_page, resched + * and see if anyone had allocated another zspage. 
+ */ + if (!migrate_zspage(pool, class, &cc)) + break; + + putback_zspage(class, dst_zspage); + migrate_write_unlock(dst_zspage); + dst_zspage = NULL; + if (spin_is_contended(&pool->lock)) + break; + } + + /* Stop if we couldn't find slot */ + if (dst_zspage == NULL) + break; + + putback_zspage(class, dst_zspage); + migrate_write_unlock(dst_zspage); + + if (putback_zspage(class, src_zspage) == ZS_EMPTY) { + migrate_write_unlock(src_zspage); + free_zspage(pool, class, src_zspage); + pages_freed += class->pages_per_zspage; + } else + migrate_write_unlock(src_zspage); + spin_unlock(&pool->lock); + cond_resched(); + spin_lock(&pool->lock); + } + + if (src_zspage) { + putback_zspage(class, src_zspage); + migrate_write_unlock(src_zspage); + } + + spin_unlock(&pool->lock); + + return pages_freed; +} + +unsigned long zs_compact(struct zs_pool *pool) +{ + int i; + struct size_class *class; + unsigned long pages_freed = 0; + + /* + * Pool compaction is performed under pool->lock so it is basically + * single-threaded. Having more than one thread in __zs_compact() + * will increase pool->lock contention, which will impact other + * zsmalloc operations that need pool->lock. 
+ */ + if (atomic_xchg(&pool->compaction_in_progress, 1)) + return 0; + + for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { + class = pool->size_class[i]; + if (class->index != i) + continue; + pages_freed += __zs_compact(pool, class); + } + atomic_long_add(pages_freed, &pool->stats.pages_compacted); + atomic_set(&pool->compaction_in_progress, 0); + + return pages_freed; +} +EXPORT_SYMBOL_GPL(zs_compact); + +void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats) +{ + memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats)); +} +EXPORT_SYMBOL_GPL(zs_pool_stats); + +static unsigned long zs_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long pages_freed; + struct zs_pool *pool = container_of(shrinker, struct zs_pool, + shrinker); + + /* + * Compact classes and calculate compaction delta. + * May be called while a manually triggered (by user) + * compaction is running; zs_compact() then returns 0. + */ + pages_freed = zs_compact(pool); + + return pages_freed ? pages_freed : SHRINK_STOP; +} + +static unsigned long zs_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int i; + struct size_class *class; + unsigned long pages_to_free = 0; + struct zs_pool *pool = container_of(shrinker, struct zs_pool, + shrinker); + + for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { + class = pool->size_class[i]; + if (class->index != i) + continue; + + pages_to_free += zs_can_compact(class); + } + + return pages_to_free; +} + +static void zs_unregister_shrinker(struct zs_pool *pool) +{ + unregister_shrinker(&pool->shrinker); +} + +static int zs_register_shrinker(struct zs_pool *pool) +{ + pool->shrinker.scan_objects = zs_shrinker_scan; + pool->shrinker.count_objects = zs_shrinker_count; + pool->shrinker.batch = 0; + pool->shrinker.seeks = DEFAULT_SEEKS; + + return register_shrinker(&pool->shrinker, "mm-zspool:%s", + pool->name); +} + +/** + * zs_create_pool - Creates an allocation pool to work from. 
+ * @name: pool name to be created + * + * This function must be called before anything when using + * the zsmalloc allocator. + * + * On success, a pointer to the newly created pool is returned, + * otherwise NULL. + */ +struct zs_pool *zs_create_pool(const char *name) +{ + int i; + struct zs_pool *pool; + struct size_class *prev_class = NULL; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + + init_deferred_free(pool); + spin_lock_init(&pool->lock); + atomic_set(&pool->compaction_in_progress, 0); + + pool->name = kstrdup(name, GFP_KERNEL); + if (!pool->name) + goto err; + + if (create_cache(pool)) + goto err; + + /* + * Iterate in reverse, because the size of the size_class we want to use + * for merging should be larger than or equal to the current size. + */ + for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { + int size; + int pages_per_zspage; + int objs_per_zspage; + struct size_class *class; + int fullness = 0; + + size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; + if (size > ZS_MAX_ALLOC_SIZE) + size = ZS_MAX_ALLOC_SIZE; + pages_per_zspage = get_pages_per_zspage(size); + objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; + + /* + * We iterate from biggest down to smallest classes, + * so huge_class_size holds the size of the first huge + * class. Any object bigger than or equal to that will + * end up in the huge class. + */ + if (pages_per_zspage != 1 && objs_per_zspage != 1 && + !huge_class_size) { + huge_class_size = size; + /* + * The object uses ZS_HANDLE_SIZE bytes to store the + * handle. We need to subtract it, because zs_malloc() + * unconditionally adds handle size before it performs + * size class search - so object may be smaller than + * huge class size, yet it still can end up in the huge + * class because it grows by ZS_HANDLE_SIZE extra bytes + * right before class lookup. + */ + huge_class_size -= (ZS_HANDLE_SIZE - 1); + } + + /* + * size_class is used for normal zsmalloc operation such + * as alloc/free for that size. 
Although it is natural that we + * have one size_class for each size, there is a chance that we + * can get more memory utilization if we use one size_class for + * many different sizes whose size_class have the same + * characteristics. So, we make size_class point to + * previous size_class if possible. + */ + if (prev_class) { + if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) { + pool->size_class[i] = prev_class; + continue; + } + } + + class = kzalloc(sizeof(struct size_class), GFP_KERNEL); + if (!class) + goto err; + + class->size = size; + class->index = i; + class->pages_per_zspage = pages_per_zspage; + class->objs_per_zspage = objs_per_zspage; + pool->size_class[i] = class; + for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; + fullness++) + INIT_LIST_HEAD(&class->fullness_list[fullness]); + + prev_class = class; + } + + /* debug only, don't abort if it fails */ + zs_pool_stat_create(pool, name); + + /* + * Not critical since shrinker is only used to trigger internal + * defragmentation of the pool which is pretty optional thing. If + * registration fails we still can use the pool normally and user can + * trigger compaction manually. Thus, ignore return code. 
+ */ + zs_register_shrinker(pool); + + return pool; + +err: + zs_destroy_pool(pool); + return NULL; +} +EXPORT_SYMBOL_GPL(zs_create_pool); + +void zs_destroy_pool(struct zs_pool *pool) +{ + int i; + + zs_unregister_shrinker(pool); + zs_flush_migration(pool); + zs_pool_stat_destroy(pool); + + for (i = 0; i < ZS_SIZE_CLASSES; i++) { + int fg; + struct size_class *class = pool->size_class[i]; + + if (!class) + continue; + + if (class->index != i) + continue; + + for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) { + if (!list_empty(&class->fullness_list[fg])) { + pr_info("Freeing non-empty class with size %db, fullness group %d\n", + class->size, fg); + } + } + kfree(class); + } + + destroy_cache(pool); + kfree(pool->name); + kfree(pool); +} +EXPORT_SYMBOL_GPL(zs_destroy_pool); + +static int __init zs_init(void) +{ + int ret; + + ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare", + zs_cpu_prepare, zs_cpu_dead); + if (ret) + goto out; + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zs_zpool_driver); +#endif + + zs_stat_init(); + + return 0; + +out: + return ret; +} + +static void __exit zs_exit(void) +{ +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif + cpuhp_remove_state(CPUHP_MM_ZS_PREPARE); + + zs_stat_exit(); +} + +module_init(zs_init); +module_exit(zs_exit); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>"); diff --git a/mm/zswap.c b/mm/zswap.c new file mode 100644 index 000000000..b3829ada4 --- /dev/null +++ b/mm/zswap.c @@ -0,0 +1,1565 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * zswap.c - zswap driver file + * + * zswap is a backend for frontswap that takes pages that are in the process + * of being swapped out and attempts to compress and store them in a + * RAM-based memory pool. This can result in a significant I/O reduction on + * the swap device and, in the case where decompressing from RAM is faster + * than reading from the swap device, can also improve workload performance. 
+ * + * Copyright (C) 2012 Seth Jennings +*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "swap.h" + +/********************************* +* statistics +**********************************/ +/* Total bytes used by the compressed storage */ +u64 zswap_pool_total_size; +/* The number of compressed pages currently stored in zswap */ +atomic_t zswap_stored_pages = ATOMIC_INIT(0); +/* The number of same-value filled pages currently stored in zswap */ +static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); + +/* + * The statistics below are not protected from concurrent access for + * performance reasons so they may not be a 100% accurate. However, + * they do provide useful information on roughly how many times a + * certain event is occurring. +*/ + +/* Pool limit was hit (see zswap_max_pool_percent) */ +static u64 zswap_pool_limit_hit; +/* Pages written back when pool limit was reached */ +static u64 zswap_written_back_pages; +/* Store failed due to a reclaim failure after pool limit was reached */ +static u64 zswap_reject_reclaim_fail; +/* Compressed page was too big for the allocator to (optimally) store */ +static u64 zswap_reject_compress_poor; +/* Store failed because underlying allocator could not get memory */ +static u64 zswap_reject_alloc_fail; +/* Store failed because the entry metadata could not be allocated (rare) */ +static u64 zswap_reject_kmemcache_fail; +/* Duplicate store was encountered (rare) */ +static u64 zswap_duplicate_entry; + +/* Shrinker work queue */ +static struct workqueue_struct *shrink_wq; +/* Pool limit was hit, we need to calm down */ +static bool zswap_pool_reached_full; + +/********************************* +* tunables +**********************************/ + +#define ZSWAP_PARAM_UNSET "" + +/* Enable/disable 
zswap */ +static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON); +static int zswap_enabled_param_set(const char *, + const struct kernel_param *); +static const struct kernel_param_ops zswap_enabled_param_ops = { + .set = zswap_enabled_param_set, + .get = param_get_bool, +}; +module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); + +/* Crypto compressor to use */ +static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; +static int zswap_compressor_param_set(const char *, + const struct kernel_param *); +static const struct kernel_param_ops zswap_compressor_param_ops = { + .set = zswap_compressor_param_set, + .get = param_get_charp, + .free = param_free_charp, +}; +module_param_cb(compressor, &zswap_compressor_param_ops, + &zswap_compressor, 0644); + +/* Compressed storage zpool to use */ +static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; +static int zswap_zpool_param_set(const char *, const struct kernel_param *); +static const struct kernel_param_ops zswap_zpool_param_ops = { + .set = zswap_zpool_param_set, + .get = param_get_charp, + .free = param_free_charp, +}; +module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); + +/* The maximum percentage of memory that the compressed pool can occupy */ +static unsigned int zswap_max_pool_percent = 20; +module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); + +/* The threshold for accepting new pages after the max_pool_percent was hit */ +static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */ +module_param_named(accept_threshold_percent, zswap_accept_thr_percent, + uint, 0644); + +/* + * Enable/disable handling same-value filled pages (enabled by default). + * If disabled every page is considered non-same-value filled. 
+ */ +static bool zswap_same_filled_pages_enabled = true; +module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, + bool, 0644); + +/* Enable/disable handling non-same-value filled pages (enabled by default) */ +static bool zswap_non_same_filled_pages_enabled = true; +module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled, + bool, 0644); + +/********************************* +* data structures +**********************************/ + +struct crypto_acomp_ctx { + struct crypto_acomp *acomp; + struct acomp_req *req; + struct crypto_wait wait; + u8 *dstmem; + struct mutex *mutex; +}; + +struct zswap_pool { + struct zpool *zpool; + struct crypto_acomp_ctx __percpu *acomp_ctx; + struct kref kref; + struct list_head list; + struct work_struct release_work; + struct work_struct shrink_work; + struct hlist_node node; + char tfm_name[CRYPTO_MAX_ALG_NAME]; +}; + +/* + * struct zswap_entry + * + * This structure contains the metadata for tracking a single compressed + * page within zswap. + * + * rbnode - links the entry into red-black tree for the appropriate swap type + * offset - the swap offset for the entry. Index into the red-black tree. + * refcount - the number of outstanding references to the entry. This is needed + * to protect against premature freeing of the entry by + * concurrent calls to load, invalidate, and writeback. The lock + * for the zswap_tree structure that contains the entry must + * be held while changing the refcount. Since the lock must + * be held, there is no reason to also make refcount atomic. + * length - the length in bytes of the compressed page data. Needed during + * decompression. For a same value filled page length is 0. 
+ * pool - the zswap_pool the entry's data is in + * handle - zpool allocation handle that stores the compressed page data + * value - value of the same-value filled pages which have the same content + */ +struct zswap_entry { + struct rb_node rbnode; + pgoff_t offset; + int refcount; + unsigned int length; + struct zswap_pool *pool; + union { + unsigned long handle; + unsigned long value; + }; + struct obj_cgroup *objcg; +}; + +struct zswap_header { + swp_entry_t swpentry; +}; + +/* + * The tree lock in the zswap_tree struct protects a few things: + * - the rbtree + * - the refcount field of each entry in the tree + */ +struct zswap_tree { + struct rb_root rbroot; + spinlock_t lock; +}; + +static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; + +/* RCU-protected iteration */ +static LIST_HEAD(zswap_pools); +/* protects zswap_pools list modification */ +static DEFINE_SPINLOCK(zswap_pools_lock); +/* pool counter to provide unique names to zpool */ +static atomic_t zswap_pools_count = ATOMIC_INIT(0); + +/* used by param callback function */ +static bool zswap_init_started; + +/* fatal error during init */ +static bool zswap_init_failed; + +/* init completed, but couldn't create the initial pool */ +static bool zswap_has_pool; + +/********************************* +* helpers and fwd declarations +**********************************/ + +#define zswap_pool_debug(msg, p) \ + pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ + zpool_get_type((p)->zpool)) + +static int zswap_writeback_entry(struct zpool *pool, unsigned long handle); +static int zswap_pool_get(struct zswap_pool *pool); +static void zswap_pool_put(struct zswap_pool *pool); + +static const struct zpool_ops zswap_zpool_ops = { + .evict = zswap_writeback_entry +}; + +static bool zswap_is_full(void) +{ + return totalram_pages() * zswap_max_pool_percent / 100 < + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); +} + +static bool zswap_can_accept(void) +{ + return totalram_pages() * zswap_accept_thr_percent / 100 * + 
zswap_max_pool_percent / 100 > + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); +} + +static void zswap_update_total_size(void) +{ + struct zswap_pool *pool; + u64 total = 0; + + rcu_read_lock(); + + list_for_each_entry_rcu(pool, &zswap_pools, list) + total += zpool_get_total_size(pool->zpool); + + rcu_read_unlock(); + + zswap_pool_total_size = total; +} + +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static int __init zswap_entry_cache_create(void) +{ + zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); + return zswap_entry_cache == NULL; +} + +static void __init zswap_entry_cache_destroy(void) +{ + kmem_cache_destroy(zswap_entry_cache); +} + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc(zswap_entry_cache, gfp); + if (!entry) + return NULL; + entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); + return entry; +} + +static void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + +/********************************* +* rbtree functions +**********************************/ +static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) +{ + struct rb_node *node = root->rb_node; + struct zswap_entry *entry; + + while (node) { + entry = rb_entry(node, struct zswap_entry, rbnode); + if (entry->offset > offset) + node = node->rb_left; + else if (entry->offset < offset) + node = node->rb_right; + else + return entry; + } + return NULL; +} + +/* + * In the case that an entry with the same offset is found, a pointer to + * the existing entry is stored in dupentry and the function returns -EEXIST + */ +static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, + struct zswap_entry **dupentry) +{ + struct rb_node **link = &root->rb_node, *parent = NULL; + struct zswap_entry *myentry; + + while (*link) { + parent = 
*link; + myentry = rb_entry(parent, struct zswap_entry, rbnode); + if (myentry->offset > entry->offset) + link = &(*link)->rb_left; + else if (myentry->offset < entry->offset) + link = &(*link)->rb_right; + else { + *dupentry = myentry; + return -EEXIST; + } + } + rb_link_node(&entry->rbnode, parent, link); + rb_insert_color(&entry->rbnode, root); + return 0; +} + +static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) +{ + if (!RB_EMPTY_NODE(&entry->rbnode)) { + rb_erase(&entry->rbnode, root); + RB_CLEAR_NODE(&entry->rbnode); + } +} + +/* + * Carries out the common pattern of freeing an entry's zpool allocation, + * freeing the entry itself, and decrementing the number of stored pages. + */ +static void zswap_free_entry(struct zswap_entry *entry) +{ + if (entry->objcg) { + obj_cgroup_uncharge_zswap(entry->objcg, entry->length); + obj_cgroup_put(entry->objcg); + } + if (!entry->length) + atomic_dec(&zswap_same_filled_pages); + else { + zpool_free(entry->pool->zpool, entry->handle); + zswap_pool_put(entry->pool); + } + zswap_entry_cache_free(entry); + atomic_dec(&zswap_stored_pages); + zswap_update_total_size(); +} + +/* caller must hold the tree lock */ +static void zswap_entry_get(struct zswap_entry *entry) +{ + entry->refcount++; +} + +/* caller must hold the tree lock +* remove from the tree and free it, if nobody references the entry +*/ +static void zswap_entry_put(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + int refcount = --entry->refcount; + + BUG_ON(refcount < 0); + if (refcount == 0) { + zswap_rb_erase(&tree->rbroot, entry); + zswap_free_entry(entry); + } +} + +/* caller must hold the tree lock */ +static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, + pgoff_t offset) +{ + struct zswap_entry *entry; + + entry = zswap_rb_search(root, offset); + if (entry) + zswap_entry_get(entry); + + return entry; +} + +/********************************* +* per-cpu code +**********************************/ +static 
DEFINE_PER_CPU(u8 *, zswap_dstmem); +/* + * If users dynamically change the zpool type and compressor at runtime, i.e. + * zswap is running, zswap can have more than one zpool on one cpu, but they + * are sharing dstmem. So we need this mutex to be per-cpu. + * Both per-cpu objects are allocated by zswap_dstmem_prepare() (the dstmem + * buffer is PAGE_SIZE * 2 bytes) and freed by zswap_dstmem_dead(). + */ +static DEFINE_PER_CPU(struct mutex *, zswap_mutex); + +static int zswap_dstmem_prepare(unsigned int cpu) +{ + struct mutex *mutex; + u8 *dst; + + dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + if (!dst) + return -ENOMEM; + + mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu)); + if (!mutex) { + kfree(dst); + return -ENOMEM; + } + + mutex_init(mutex); + per_cpu(zswap_dstmem, cpu) = dst; + per_cpu(zswap_mutex, cpu) = mutex; + return 0; +} + +static int zswap_dstmem_dead(unsigned int cpu) +{ + struct mutex *mutex; + u8 *dst; + + mutex = per_cpu(zswap_mutex, cpu); + kfree(mutex); + per_cpu(zswap_mutex, cpu) = NULL; + + dst = per_cpu(zswap_dstmem, cpu); + kfree(dst); + per_cpu(zswap_dstmem, cpu) = NULL; + + return 0; +} + +static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct crypto_acomp *acomp; + struct acomp_req *req; + + acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); + if (IS_ERR(acomp)) { + pr_err("could not alloc crypto acomp %s : %ld\n", + pool->tfm_name, PTR_ERR(acomp)); + return PTR_ERR(acomp); + } + acomp_ctx->acomp = acomp; + + req = acomp_request_alloc(acomp_ctx->acomp); + if (!req) { + pr_err("could not alloc crypto acomp_request %s\n", + pool->tfm_name); + crypto_free_acomp(acomp_ctx->acomp); + return -ENOMEM; + } + acomp_ctx->req = req; + + crypto_init_wait(&acomp_ctx->wait); + /* + * if the backend of acomp is async zip, crypto_req_done() will wake up + * crypto_wait_req(); if the backend of acomp is scomp, the callback + * won't be called, crypto_wait_req()
will return without blocking. + */ + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &acomp_ctx->wait); + + acomp_ctx->mutex = per_cpu(zswap_mutex, cpu); + acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu); + + return 0; +} + +static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) +{ + struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + + if (!IS_ERR_OR_NULL(acomp_ctx)) { + if (!IS_ERR_OR_NULL(acomp_ctx->req)) + acomp_request_free(acomp_ctx->req); + if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) + crypto_free_acomp(acomp_ctx->acomp); + } + + return 0; +} + +/********************************* +* pool functions +**********************************/ + +static struct zswap_pool *__zswap_pool_current(void) +{ + struct zswap_pool *pool; + + pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); + WARN_ONCE(!pool && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + + return pool; +} + +static struct zswap_pool *zswap_pool_current(void) +{ + assert_spin_locked(&zswap_pools_lock); + + return __zswap_pool_current(); +} + +static struct zswap_pool *zswap_pool_current_get(void) +{ + struct zswap_pool *pool; + + rcu_read_lock(); + + pool = __zswap_pool_current(); + if (!zswap_pool_get(pool)) + pool = NULL; + + rcu_read_unlock(); + + return pool; +} + +static struct zswap_pool *zswap_pool_last_get(void) +{ + struct zswap_pool *pool, *last = NULL; + + rcu_read_lock(); + + list_for_each_entry_rcu(pool, &zswap_pools, list) + last = pool; + WARN_ONCE(!last && zswap_has_pool, + "%s: no page storage pool!\n", __func__); + if (!zswap_pool_get(last)) + last = NULL; + + rcu_read_unlock(); + + return last; +} + +/* type and compressor must be null-terminated */ +static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) +{ + struct zswap_pool *pool; + + assert_spin_locked(&zswap_pools_lock); + + 
list_for_each_entry_rcu(pool, &zswap_pools, list) { + if (strcmp(pool->tfm_name, compressor)) + continue; + if (strcmp(zpool_get_type(pool->zpool), type)) + continue; + /* if we can't get it, it's about to be destroyed */ + if (!zswap_pool_get(pool)) + continue; + return pool; + } + + return NULL; +} + +static void shrink_worker(struct work_struct *w) +{ + struct zswap_pool *pool = container_of(w, typeof(*pool), + shrink_work); + + if (zpool_shrink(pool->zpool, 1, NULL)) + zswap_reject_reclaim_fail++; + zswap_pool_put(pool); +} + +static struct zswap_pool *zswap_pool_create(char *type, char *compressor) +{ + struct zswap_pool *pool; + char name[38]; /* 'zswap' + 32 char (max) num + \0 */ + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + int ret; + + if (!zswap_has_pool) { + /* if either are unset, pool initialization failed, and we + * need both params to be set correctly before trying to + * create a pool. + */ + if (!strcmp(type, ZSWAP_PARAM_UNSET)) + return NULL; + if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) + return NULL; + } + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); + + pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops); + if (!pool->zpool) { + pr_err("%s zpool not available\n", type); + goto error; + } + pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); + + strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); + + pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); + if (!pool->acomp_ctx) { + pr_err("percpu alloc failed\n"); + goto error; + } + + ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE, + &pool->node); + if (ret) + goto error; + pr_debug("using %s compressor\n", pool->tfm_name); + + /* being the current pool takes 1 ref; this func expects the + * caller to always add the new pool as the current pool + */ + 
kref_init(&pool->kref); + INIT_LIST_HEAD(&pool->list); + INIT_WORK(&pool->shrink_work, shrink_worker); + + zswap_pool_debug("created", pool); + + return pool; + +error: + if (pool->acomp_ctx) + free_percpu(pool->acomp_ctx); + if (pool->zpool) + zpool_destroy_pool(pool->zpool); + kfree(pool); + return NULL; +} + +static __init struct zswap_pool *__zswap_pool_create_fallback(void) +{ + bool has_comp, has_zpool; + + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); + if (!has_comp && strcmp(zswap_compressor, + CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { + pr_err("compressor %s not available, using default %s\n", + zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); + param_free_charp(&zswap_compressor); + zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); + } + if (!has_comp) { + pr_err("default compressor %s not available\n", + zswap_compressor); + param_free_charp(&zswap_compressor); + zswap_compressor = ZSWAP_PARAM_UNSET; + } + + has_zpool = zpool_has_pool(zswap_zpool_type); + if (!has_zpool && strcmp(zswap_zpool_type, + CONFIG_ZSWAP_ZPOOL_DEFAULT)) { + pr_err("zpool %s not available, using default %s\n", + zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; + has_zpool = zpool_has_pool(zswap_zpool_type); + } + if (!has_zpool) { + pr_err("default zpool %s not available\n", + zswap_zpool_type); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = ZSWAP_PARAM_UNSET; + } + + if (!has_comp || !has_zpool) + return NULL; + + return zswap_pool_create(zswap_zpool_type, zswap_compressor); +} + +static void zswap_pool_destroy(struct zswap_pool *pool) +{ + zswap_pool_debug("destroying", pool); + + cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); + free_percpu(pool->acomp_ctx); + zpool_destroy_pool(pool->zpool); + kfree(pool); +} + +static int __must_check zswap_pool_get(struct zswap_pool *pool) +{ + if (!pool) + 
return 0; + + return kref_get_unless_zero(&pool->kref); +} + +static void __zswap_pool_release(struct work_struct *work) +{ + struct zswap_pool *pool = container_of(work, typeof(*pool), + release_work); + + synchronize_rcu(); + + /* nobody should have been able to get a kref... */ + WARN_ON(kref_get_unless_zero(&pool->kref)); + + /* pool is now off zswap_pools list and has no references. */ + zswap_pool_destroy(pool); +} + +static void __zswap_pool_empty(struct kref *kref) +{ + struct zswap_pool *pool; + + pool = container_of(kref, typeof(*pool), kref); + + spin_lock(&zswap_pools_lock); + + WARN_ON(pool == zswap_pool_current()); + + list_del_rcu(&pool->list); + + INIT_WORK(&pool->release_work, __zswap_pool_release); + schedule_work(&pool->release_work); + + spin_unlock(&zswap_pools_lock); +} + +static void zswap_pool_put(struct zswap_pool *pool) +{ + kref_put(&pool->kref, __zswap_pool_empty); +} + +/********************************* +* param callbacks +**********************************/ + +/* val must be a null-terminated string */ +static int __zswap_param_set(const char *val, const struct kernel_param *kp, + char *type, char *compressor) +{ + struct zswap_pool *pool, *put_pool = NULL; + char *s = strstrip((char *)val); + int ret; + + if (zswap_init_failed) { + pr_err("can't set param, initialization failed\n"); + return -ENODEV; + } + + /* no change required */ + if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) + return 0; + + /* if this is load-time (pre-init) param setting, + * don't create a pool; that's done during init. 
+ */ + if (!zswap_init_started) + return param_set_charp(s, kp); + + if (!type) { + if (!zpool_has_pool(s)) { + pr_err("zpool %s not available\n", s); + return -ENOENT; + } + type = s; + } else if (!compressor) { + if (!crypto_has_acomp(s, 0, 0)) { + pr_err("compressor %s not available\n", s); + return -ENOENT; + } + compressor = s; + } else { + WARN_ON(1); + return -EINVAL; + } + + spin_lock(&zswap_pools_lock); + + pool = zswap_pool_find_get(type, compressor); + if (pool) { + zswap_pool_debug("using existing", pool); + WARN_ON(pool == zswap_pool_current()); + list_del_rcu(&pool->list); + } + + spin_unlock(&zswap_pools_lock); + + if (!pool) + pool = zswap_pool_create(type, compressor); + + if (pool) + ret = param_set_charp(s, kp); + else + ret = -EINVAL; + + spin_lock(&zswap_pools_lock); + + if (!ret) { + put_pool = zswap_pool_current(); + list_add_rcu(&pool->list, &zswap_pools); + zswap_has_pool = true; + } else if (pool) { + /* add the possibly pre-existing pool to the end of the pools + * list; if it's new (and empty) then it'll be removed and + * destroyed by the put after we drop the lock + */ + list_add_tail_rcu(&pool->list, &zswap_pools); + put_pool = pool; + } + + spin_unlock(&zswap_pools_lock); + + if (!zswap_has_pool && !pool) { + /* if initial pool creation failed, and this pool creation also + * failed, maybe both compressor and zpool params were bad. + * Allow changing this param, so pool creation will succeed + * when the other param is changed. We already verified this + * param is ok in the zpool_has_pool() or crypto_has_acomp() + * checks above. 
+ */ + ret = param_set_charp(s, kp); + } + + /* drop the ref from either the old current pool, + * or the new pool we failed to add + */ + if (put_pool) + zswap_pool_put(put_pool); + + return ret; +} + +static int zswap_compressor_param_set(const char *val, + const struct kernel_param *kp) +{ + return __zswap_param_set(val, kp, zswap_zpool_type, NULL); +} + +static int zswap_zpool_param_set(const char *val, + const struct kernel_param *kp) +{ + return __zswap_param_set(val, kp, NULL, zswap_compressor); +} + +static int zswap_enabled_param_set(const char *val, + const struct kernel_param *kp) +{ + if (zswap_init_failed) { + pr_err("can't enable, initialization failed\n"); + return -ENODEV; + } + if (!zswap_has_pool && zswap_init_started) { + pr_err("can't enable, no pool configured\n"); + return -ENODEV; + } + + return param_set_bool(val, kp); +} + +/********************************* +* writeback code +**********************************/ +/* return enum for zswap_get_swap_cache_page */ +enum zswap_get_swap_ret { + ZSWAP_SWAPCACHE_NEW, + ZSWAP_SWAPCACHE_EXIST, + ZSWAP_SWAPCACHE_FAIL, +}; + +/* + * zswap_get_swap_cache_page + * + * This is an adaption of read_swap_cache_async() + * + * This function tries to find a page with the given swap entry + * in the swapper_space address space (the swap cache). If the page + * is found, it is returned in retpage. Otherwise, a page is allocated, + * added to the swap cache, and returned in retpage. 
+ * + * If success, the swap cache page is returned in retpage + * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache + * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, + * the new page is added to swapcache and locked + * Returns ZSWAP_SWAPCACHE_FAIL on error + */ +static int zswap_get_swap_cache_page(swp_entry_t entry, + struct page **retpage) +{ + bool page_was_allocated; + + *retpage = __read_swap_cache_async(entry, GFP_KERNEL, + NULL, 0, &page_was_allocated); + if (page_was_allocated) + return ZSWAP_SWAPCACHE_NEW; + if (!*retpage) + return ZSWAP_SWAPCACHE_FAIL; + return ZSWAP_SWAPCACHE_EXIST; +} + +/* + * Attempts to free an entry by adding a page to the swap cache, + * decompressing the entry data into the page, and issuing a + * bio write to write the page back to the swap device. + * + * This can be thought of as a "resumed writeback" of the page + * to the swap device. We are basically resuming the same swap + * writeback path that was intercepted with the frontswap_store() + * in the first place. After the page has been decompressed into + * the swap cache, the compressed version stored by zswap can be + * freed. 
+ */ +static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) +{ + struct zswap_header *zhdr; + swp_entry_t swpentry; + struct zswap_tree *tree; + pgoff_t offset; + struct zswap_entry *entry; + struct page *page; + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; + + u8 *src, *tmp = NULL; + unsigned int dlen; + int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; + + if (!zpool_can_sleep_mapped(pool)) { + tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + } + + /* extract swpentry from data */ + zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); + swpentry = zhdr->swpentry; /* here */ + tree = zswap_trees[swp_type(swpentry)]; + offset = swp_offset(swpentry); + + /* find and ref zswap entry */ + spin_lock(&tree->lock); + entry = zswap_entry_find_get(&tree->rbroot, offset); + if (!entry) { + /* entry was invalidated */ + spin_unlock(&tree->lock); + zpool_unmap_handle(pool, handle); + kfree(tmp); + return 0; + } + spin_unlock(&tree->lock); + BUG_ON(offset != entry->offset); + + src = (u8 *)zhdr + sizeof(struct zswap_header); + if (!zpool_can_sleep_mapped(pool)) { + memcpy(tmp, src, entry->length); + src = tmp; + zpool_unmap_handle(pool, handle); + } + + /* try to allocate swap cache page */ + switch (zswap_get_swap_cache_page(swpentry, &page)) { + case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ + ret = -ENOMEM; + goto fail; + + case ZSWAP_SWAPCACHE_EXIST: + /* page is already in the swap cache, ignore for now */ + put_page(page); + ret = -EEXIST; + goto fail; + + case ZSWAP_SWAPCACHE_NEW: /* page is locked */ + /* + * Having a local reference to the zswap entry doesn't exclude + * swapping from invalidating and recycling the swap slot. Once + * the swapcache is secured against concurrent swapping to and + * from the slot, recheck that the entry is still current before + * writing. 
+ */ + spin_lock(&tree->lock); + if (zswap_rb_search(&tree->rbroot, entry->offset) != entry) { + spin_unlock(&tree->lock); + delete_from_swap_cache(page_folio(page)); + ret = -ENOMEM; + goto fail; + } + spin_unlock(&tree->lock); + + /* decompress */ + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + dlen = PAGE_SIZE; + + mutex_lock(acomp_ctx->mutex); + sg_init_one(&input, src, entry->length); + sg_init_table(&output, 1); + sg_set_page(&output, page, PAGE_SIZE, 0); + acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); + ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + mutex_unlock(acomp_ctx->mutex); + + BUG_ON(ret); + BUG_ON(dlen != PAGE_SIZE); + + /* page is up to date */ + SetPageUptodate(page); + } + + /* move it to the tail of the inactive list after end_writeback */ + SetPageReclaim(page); + + /* start writeback */ + __swap_writepage(page, &wbc); + put_page(page); + zswap_written_back_pages++; + + spin_lock(&tree->lock); + /* drop local reference */ + zswap_entry_put(tree, entry); + + /* + * There are two possible situations for entry here: + * (1) refcount is 1(normal case), entry is valid and on the tree + * (2) refcount is 0, entry is freed and not on the tree + * because invalidate happened during writeback + * search the tree and free the entry if find entry + */ + if (entry == zswap_rb_search(&tree->rbroot, offset)) + zswap_entry_put(tree, entry); + spin_unlock(&tree->lock); + + goto end; + + /* + * if we get here due to ZSWAP_SWAPCACHE_EXIST + * a load may be happening concurrently. + * it is safe and okay to not free the entry. 
+ * if we free the entry in the following put + * it is also okay to return !0 + */ +fail: + spin_lock(&tree->lock); + zswap_entry_put(tree, entry); + spin_unlock(&tree->lock); + +end: + if (zpool_can_sleep_mapped(pool)) + zpool_unmap_handle(pool, handle); + else + kfree(tmp); + + return ret; +} + +static int zswap_is_page_same_filled(void *ptr, unsigned long *value) +{ + unsigned int pos; + unsigned long *page; + + page = (unsigned long *)ptr; + for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { + if (page[pos] != page[0]) + return 0; + } + *value = page[0]; + return 1; +} + +static void zswap_fill_page(void *ptr, unsigned long value) +{ + unsigned long *page; + + page = (unsigned long *)ptr; + memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); +} + +/********************************* +* frontswap hooks +**********************************/ +/* attempts to compress and store an single page */ +static int zswap_frontswap_store(unsigned type, pgoff_t offset, + struct page *page) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct zswap_entry *entry, *dupentry; + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; + struct obj_cgroup *objcg = NULL; + struct zswap_pool *pool; + int ret; + unsigned int hlen, dlen = PAGE_SIZE; + unsigned long handle, value; + char *buf; + u8 *src, *dst; + struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; + gfp_t gfp; + + /* THP isn't supported */ + if (PageTransHuge(page)) { + ret = -EINVAL; + goto reject; + } + + if (!zswap_enabled || !tree) { + ret = -ENODEV; + goto reject; + } + + /* + * XXX: zswap reclaim does not work with cgroups yet. Without a + * cgroup-aware entry LRU, we will push out entries system-wide based on + * local cgroup limits. 
+ */ + objcg = get_obj_cgroup_from_page(page); + if (objcg && !obj_cgroup_may_zswap(objcg)) { + ret = -ENOMEM; + goto reject; + } + + /* reclaim space if needed */ + if (zswap_is_full()) { + zswap_pool_limit_hit++; + zswap_pool_reached_full = true; + goto shrink; + } + + if (zswap_pool_reached_full) { + if (!zswap_can_accept()) { + ret = -ENOMEM; + goto reject; + } else + zswap_pool_reached_full = false; + } + + /* allocate entry */ + entry = zswap_entry_cache_alloc(GFP_KERNEL); + if (!entry) { + zswap_reject_kmemcache_fail++; + ret = -ENOMEM; + goto reject; + } + + if (zswap_same_filled_pages_enabled) { + src = kmap_atomic(page); + if (zswap_is_page_same_filled(src, &value)) { + kunmap_atomic(src); + entry->offset = offset; + entry->length = 0; + entry->value = value; + atomic_inc(&zswap_same_filled_pages); + goto insert_entry; + } + kunmap_atomic(src); + } + + if (!zswap_non_same_filled_pages_enabled) { + ret = -EINVAL; + goto freepage; + } + + /* if entry is successfully added, it keeps the reference */ + entry->pool = zswap_pool_current_get(); + if (!entry->pool) { + ret = -EINVAL; + goto freepage; + } + + /* compress */ + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + + mutex_lock(acomp_ctx->mutex); + + dst = acomp_ctx->dstmem; + sg_init_table(&input, 1); + sg_set_page(&input, page, PAGE_SIZE, 0); + + /* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */ + sg_init_one(&output, dst, PAGE_SIZE * 2); + acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); + /* + * it maybe looks a little bit silly that we send an asynchronous request, + * then wait for its completion synchronously. This makes the process look + * synchronous in fact. + * Theoretically, acomp supports users send multiple acomp requests in one + * acomp instance, then get those requests done simultaneously. 
but in this + * case, frontswap actually does store and load page by page, there is no + * existing method to send the second page before the first page is done + * in one thread doing frontswap. + * but in different threads running on different cpu, we have different + * acomp instance, so multiple threads can do (de)compression in parallel. + */ + ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + + if (ret) { + ret = -EINVAL; + goto put_dstmem; + } + + /* store */ + hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0; + gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + if (zpool_malloc_support_movable(entry->pool->zpool)) + gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; + ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle); + if (ret == -ENOSPC) { + zswap_reject_compress_poor++; + goto put_dstmem; + } + if (ret) { + zswap_reject_alloc_fail++; + goto put_dstmem; + } + buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO); + memcpy(buf, &zhdr, hlen); + memcpy(buf + hlen, dst, dlen); + zpool_unmap_handle(entry->pool->zpool, handle); + mutex_unlock(acomp_ctx->mutex); + + /* populate entry */ + entry->offset = offset; + entry->handle = handle; + entry->length = dlen; + +insert_entry: + entry->objcg = objcg; + if (objcg) { + obj_cgroup_charge_zswap(objcg, entry->length); + /* Account before objcg ref is moved to tree */ + count_objcg_event(objcg, ZSWPOUT); + } + + /* map */ + spin_lock(&tree->lock); + do { + ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); + if (ret == -EEXIST) { + zswap_duplicate_entry++; + /* remove from rbtree */ + zswap_rb_erase(&tree->rbroot, dupentry); + zswap_entry_put(tree, dupentry); + } + } while (ret == -EEXIST); + spin_unlock(&tree->lock); + + /* update stats */ + atomic_inc(&zswap_stored_pages); + zswap_update_total_size(); + count_vm_event(ZSWPOUT); + + return 0; + +put_dstmem: + mutex_unlock(acomp_ctx->mutex); + 
zswap_pool_put(entry->pool); +freepage: + zswap_entry_cache_free(entry); +reject: + if (objcg) + obj_cgroup_put(objcg); + return ret; + +shrink: + pool = zswap_pool_last_get(); + /* + * queue_work() returns false when shrink_work is already pending. In + * that case shrink_worker() will run once and put only the reference + * taken by the earlier queuer, so drop the one we just took here. + */ + if (pool && !queue_work(shrink_wq, &pool->shrink_work)) + zswap_pool_put(pool); + ret = -ENOMEM; + goto reject; +} + +/* + * returns 0 if the page was successfully decompressed + * returns -1 if no entry is found for the offset, or -ENOMEM if the + * temporary copy buffer cannot be allocated + */ +static int zswap_frontswap_load(unsigned type, pgoff_t offset, + struct page *page) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct zswap_entry *entry; + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; + u8 *src, *dst, *tmp; + unsigned int dlen; + int ret; + + /* find */ + spin_lock(&tree->lock); + entry = zswap_entry_find_get(&tree->rbroot, offset); + if (!entry) { + /* entry was written back */ + spin_unlock(&tree->lock); + return -1; + } + spin_unlock(&tree->lock); + + if (!entry->length) { + dst = kmap_atomic(page); + zswap_fill_page(dst, entry->value); + kunmap_atomic(dst); + ret = 0; + goto stats; + } + + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + tmp = kmalloc(entry->length, GFP_ATOMIC); + if (!tmp) { + ret = -ENOMEM; + goto freeentry; + } + } + + /* decompress */ + dlen = PAGE_SIZE; + src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); + if (zpool_evictable(entry->pool->zpool)) + src += sizeof(struct zswap_header); + + if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + memcpy(tmp, src, entry->length); + src = tmp; + zpool_unmap_handle(entry->pool->zpool, entry->handle); + } + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + mutex_lock(acomp_ctx->mutex); + sg_init_one(&input, src, entry->length); + sg_init_table(&output, 1); + sg_set_page(&output, page, PAGE_SIZE, 0); + acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); + ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); + mutex_unlock(acomp_ctx->mutex); + + if (zpool_can_sleep_mapped(entry->pool->zpool)) +
zpool_unmap_handle(entry->pool->zpool, entry->handle); + else + kfree(tmp); + + BUG_ON(ret); +stats: + count_vm_event(ZSWPIN); + if (entry->objcg) + count_objcg_event(entry->objcg, ZSWPIN); +freeentry: + spin_lock(&tree->lock); + zswap_entry_put(tree, entry); + spin_unlock(&tree->lock); + + return ret; +} + +/* frees an entry in zswap */ +static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct zswap_entry *entry; + + /* find */ + spin_lock(&tree->lock); + entry = zswap_rb_search(&tree->rbroot, offset); + if (!entry) { + /* entry was written back */ + spin_unlock(&tree->lock); + return; + } + + /* remove from rbtree */ + zswap_rb_erase(&tree->rbroot, entry); + + /* drop the initial reference from entry creation */ + zswap_entry_put(tree, entry); + + spin_unlock(&tree->lock); +} + +/* frees all zswap entries for the given swap type */ +static void zswap_frontswap_invalidate_area(unsigned type) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct zswap_entry *entry, *n; + + if (!tree) + return; + + /* walk the tree and free everything */ + spin_lock(&tree->lock); + rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) + zswap_free_entry(entry); + tree->rbroot = RB_ROOT; + spin_unlock(&tree->lock); + kfree(tree); + zswap_trees[type] = NULL; +} + +static void zswap_frontswap_init(unsigned type) +{ + struct zswap_tree *tree; + + tree = kzalloc(sizeof(*tree), GFP_KERNEL); + if (!tree) { + pr_err("alloc failed, zswap disabled for swap type %d\n", type); + return; + } + + tree->rbroot = RB_ROOT; + spin_lock_init(&tree->lock); + zswap_trees[type] = tree; +} + +static const struct frontswap_ops zswap_frontswap_ops = { + .store = zswap_frontswap_store, + .load = zswap_frontswap_load, + .invalidate_page = zswap_frontswap_invalidate_page, + .invalidate_area = zswap_frontswap_invalidate_area, + .init = zswap_frontswap_init +}; + +/********************************* +* debugfs 
functions +**********************************/ +#ifdef CONFIG_DEBUG_FS +#include <linux/debugfs.h> + +static struct dentry *zswap_debugfs_root; + +/* + * Create the "zswap" debugfs directory and expose the statistics counters + * in it read-only (mode 0444). Returns 0 on success or -ENODEV when debugfs + * has not been initialized. + */ +static int __init zswap_debugfs_init(void) +{ + if (!debugfs_initialized()) + return -ENODEV; + + zswap_debugfs_root = debugfs_create_dir("zswap", NULL); + + debugfs_create_u64("pool_limit_hit", 0444, + zswap_debugfs_root, &zswap_pool_limit_hit); + debugfs_create_u64("reject_reclaim_fail", 0444, + zswap_debugfs_root, &zswap_reject_reclaim_fail); + debugfs_create_u64("reject_alloc_fail", 0444, + zswap_debugfs_root, &zswap_reject_alloc_fail); + debugfs_create_u64("reject_kmemcache_fail", 0444, + zswap_debugfs_root, &zswap_reject_kmemcache_fail); + debugfs_create_u64("reject_compress_poor", 0444, + zswap_debugfs_root, &zswap_reject_compress_poor); + debugfs_create_u64("written_back_pages", 0444, + zswap_debugfs_root, &zswap_written_back_pages); + debugfs_create_u64("duplicate_entry", 0444, + zswap_debugfs_root, &zswap_duplicate_entry); + debugfs_create_u64("pool_total_size", 0444, + zswap_debugfs_root, &zswap_pool_total_size); + debugfs_create_atomic_t("stored_pages", 0444, + zswap_debugfs_root, &zswap_stored_pages); + debugfs_create_atomic_t("same_filled_pages", 0444, + zswap_debugfs_root, &zswap_same_filled_pages); + + return 0; +} +#else +static int __init zswap_debugfs_init(void) +{ + return 0; +} +#endif + +/********************************* +* module init and exit +**********************************/ +static int __init init_zswap(void) +{ + struct zswap_pool *pool; + int ret; + + zswap_init_started = true; + + if (zswap_entry_cache_create()) { + pr_err("entry cache creation failed\n"); + goto cache_fail; + } + + ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare", + zswap_dstmem_prepare, zswap_dstmem_dead); + if (ret) { + pr_err("dstmem alloc failed\n"); + goto dstmem_fail; + } + + ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE, + "mm/zswap_pool:prepare", + zswap_cpu_comp_prepare, + zswap_cpu_comp_dead); +
if (ret) + goto hp_fail; + + pool = __zswap_pool_create_fallback(); + if (pool) { + pr_info("loaded using pool %s/%s\n", pool->tfm_name, + zpool_get_type(pool->zpool)); + list_add(&pool->list, &zswap_pools); + zswap_has_pool = true; + } else { + pr_err("pool creation failed\n"); + zswap_enabled = false; + } + + shrink_wq = create_workqueue("zswap-shrink"); + if (!shrink_wq) + goto fallback_fail; + + ret = frontswap_register_ops(&zswap_frontswap_ops); + if (ret) + goto destroy_wq; + if (zswap_debugfs_init()) + pr_warn("debugfs initialization failed\n"); + return 0; + +destroy_wq: + destroy_workqueue(shrink_wq); +fallback_fail: + if (pool) + zswap_pool_destroy(pool); +hp_fail: + cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE); +dstmem_fail: + zswap_entry_cache_destroy(); +cache_fail: + /* if built-in, we aren't unloaded on failure; don't allow use */ + zswap_init_failed = true; + zswap_enabled = false; + return -ENOMEM; +} +/* must be late so crypto has time to come up */ +late_initcall(init_zswap); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Seth Jennings "); +MODULE_DESCRIPTION("Compressed cache for swap pages"); -- cgit v1.2.3